underhill_init/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
//! This module implements the Underhill initial process.
5
6#![cfg(target_os = "linux")]
7#![expect(missing_docs)]
8// UNSAFETY: Calling libc functions to set up global system state.
9#![expect(unsafe_code)]
10
11mod options;
12mod syslog;
13
14// `pub` so that the missing_docs warning fires for options without
15// documentation.
16pub use options::Options;
17
18use anyhow::Context;
19use libc::STDERR_FILENO;
20use libc::STDIN_FILENO;
21use libc::STDOUT_FILENO;
22use libc::c_void;
23use std::collections::HashMap;
24use std::ffi::CStr;
25use std::ffi::OsStr;
26use std::io;
27use std::io::BufRead;
28use std::io::BufReader;
29use std::io::Write;
30use std::os::unix::prelude::*;
31use std::path::Path;
32use std::process::Child;
33use std::process::Command;
34use std::process::ExitStatus;
35use std::process::Stdio;
36use std::time::Duration;
37use syslog::SysLog;
38use walkdir::WalkDir;
39
40const UNDERHILL_PATH: &str = "/bin/openvmm_hcl";
41
/// Arguments for a single `mount(2)` call, borrowed as C strings so they can
/// be handed to libc without further conversion.
struct FilesystemMount<'a> {
    /// Passed as the `source` argument of `mount(2)`.
    source: &'a CStr,
    /// Directory the filesystem is mounted on.
    target: &'a CStr,
    /// Filesystem type name (for example `proc` or `sysfs`).
    fstype: &'a CStr,
    /// Filesystem-specific option string, passed as the `data` argument.
    options: &'a CStr,
    /// `MS_*` mount flags.
    flags: u64,
}
49
50impl<'a> FilesystemMount<'a> {
51    pub fn new(
52        source: &'a CStr,
53        target: &'a CStr,
54        fstype: &'a CStr,
55        flags: u64,
56        options: &'a CStr,
57    ) -> Self {
58        Self {
59            source,
60            target,
61            fstype,
62            options,
63            flags,
64        }
65    }
66
67    pub fn mount(&self) -> io::Result<()> {
68        // SAFETY: calling the API according to the documentation
69        let err = unsafe {
70            libc::mount(
71                self.source.as_ptr(),
72                self.target.as_ptr(),
73                self.fstype.as_ptr(),
74                self.flags,
75                self.options.as_ptr().cast::<c_void>(),
76            )
77        };
78
79        if err != 0 {
80            Err(io::Error::last_os_error())
81        } else {
82            Ok(())
83        }
84    }
85}
86
87mod dev_random_ioctls {
88    pub const MAX_ENTROPY_SIZE: usize = 256;
89    const RANDOM_IOC_MAGIC: u8 = b'R';
90    #[repr(C)]
91    pub struct RndAddEntropy {
92        pub entropy_count: i32,
93        pub buf_size: i32,
94        pub buf: [u8; MAX_ENTROPY_SIZE],
95    }
96    // RNDADDENTROPY _IOW( 'R', 0x03, int [2] )
97    nix::ioctl_write_ptr_bad!(
98        rnd_add_entropy_ioctl,
99        nix::request_code_write!(RANDOM_IOC_MAGIC, 0x3, size_of::<std::os::raw::c_int>() * 2),
100        RndAddEntropy
101    );
102}
103
104// If it is available, use host-generated entropy to speed up boot and
105// improve entropy quality.
106//
107// This is especially useful on machines without hardware random number
108// generation. When random numbers are requested from /dev/random, the
109// system blocks until it has gained enough entropy.
110//
111// It is safe to apply host-provided entropy even to a confidential VM,
112// because host-provided data is hashed into the existing entropy sources.
113// However, we don't know if the entropy from the host can be trusted.
114// Therefore we don't want to increase the entropy count in case the kernel
115// has not already filled its entropy pool via safe means, so we just write
116// to /dev/random instead of using rnd_add_entropy_ioctl.
117fn use_host_entropy() -> anyhow::Result<()> {
118    use dev_random_ioctls::MAX_ENTROPY_SIZE;
119
120    let host_entropy = match fs_err::read("/proc/device-tree/openhcl/entropy/reg") {
121        Ok(contents) => contents,
122        Err(e) => {
123            log::warn!("Did not get entropy from the host: {e:#}");
124            return Ok(());
125        }
126    };
127
128    if host_entropy.len() > MAX_ENTROPY_SIZE {
129        log::warn!(
130            "Truncating host-provided entropy (received {} bytes)",
131            host_entropy.len()
132        );
133    }
134    let use_entropy_bytes = std::cmp::min(host_entropy.len(), MAX_ENTROPY_SIZE);
135    log::info!("Using {} bytes of entropy from the host", use_entropy_bytes);
136
137    let mut entropy = dev_random_ioctls::RndAddEntropy {
138        entropy_count: (use_entropy_bytes * 8) as i32,
139        buf_size: use_entropy_bytes as i32,
140        buf: [0; MAX_ENTROPY_SIZE],
141    };
142    entropy.buf[..use_entropy_bytes].copy_from_slice(&host_entropy[..use_entropy_bytes]);
143
144    let mut dev_random = fs_err::OpenOptions::new()
145        .write(true)
146        .open("/dev/random")
147        .with_context(|| ("failed to open dev random for setting entropy").to_string())?;
148
149    if underhill_confidentiality::is_confidential_vm() {
150        // Just write to /dev/random (and don't increase entropy count)
151        dev_random
152            .write_all(&entropy.buf[..use_entropy_bytes])
153            .context("write to /dev/random")?;
154    } else {
155        // Write to /dev/random and increase the entropy count
156        // so that we can speed up boot when the host entropy can be trusted.
157        // SAFETY: API called according to the documentation.
158        unsafe {
159            dev_random_ioctls::rnd_add_entropy_ioctl(dev_random.as_raw_fd(), &entropy)
160                .context("rnd_add_entropy_ioctl")?;
161        }
162    }
163
164    Ok(())
165}
166
167fn setup(
168    stat_files: &[&str],
169    options: &Options,
170    writes: &[(&str, &str)],
171    filesystems: &[FilesystemMount<'_>],
172) -> anyhow::Result<()> {
173    log::info!("Mounting filesystems");
174
175    for filesystem in filesystems {
176        let path: &Path = OsStr::from_bytes(filesystem.target.to_bytes()).as_ref();
177        // Ensure the target exists.
178        fs_err::create_dir_all(path)?;
179
180        filesystem
181            .mount()
182            .with_context(|| format!("failed to mount {}", path.display()))?;
183    }
184
185    log::info!("Command line args: {:?}", options);
186
187    if log::log_enabled!(log::Level::Trace) {
188        for stat_file in stat_files {
189            if let Ok(file) = fs_err::File::open(stat_file) {
190                log::trace!("{}", stat_file);
191                for line in BufReader::new(file).lines() {
192                    if let Ok(line) = line {
193                        log::trace!("{}", line);
194                    }
195                }
196            }
197        }
198    }
199
200    log::info!("Setting system resource limits and parameters");
201
202    for (path, data) in writes {
203        fs_err::write(path, data).with_context(|| format!("failed to write {data}"))?;
204    }
205
206    use_host_entropy().context("use host entropy")?;
207
208    Ok(())
209}
210
211fn run_setup_scripts(scripts: &[String]) -> anyhow::Result<Vec<(String, String)>> {
212    let mut new_env = Vec::new();
213    for setup in scripts {
214        log::info!("Running provided setup script {}", setup);
215
216        let result = Command::new("/bin/sh")
217            .arg("-c")
218            .arg(setup)
219            .stderr(Stdio::inherit())
220            .output()
221            .context("script failed to start")?;
222
223        if !result.status.success() {
224            anyhow::bail!("setup script failed: {}", result.status);
225        }
226
227        // Capture key-value pairs in the script's stdout as environment
228        // variables.
229        for line in result.stdout.split(|&x| x == b'\n') {
230            if let Some((key, value)) = std::str::from_utf8(line)
231                .ok()
232                .and_then(|line| line.split_once('='))
233            {
234                log::info!("setting env var {}={}", key, value);
235                new_env.push((key.into(), value.into()));
236            }
237        }
238    }
239    Ok(new_env)
240}
241
242fn run(options: &Options, env: impl IntoIterator<Item = (String, String)>) -> anyhow::Result<()> {
243    let mut command = Command::new(UNDERHILL_PATH);
244    command.arg("--pid").arg("/run/underhill.pid");
245    command.args(&options.underhill_args);
246    command.envs(env);
247
248    // Update the file descriptor limit for the main process, since large VMs
249    // require lots of fds. There is no downside to a larger value except that
250    // we may less effectively catch fd leaks (which have not historically been
251    // a problem). So use a value that is plenty large enough for any VM.
252    let limit = 0x100000;
253    // SAFETY: calling according to docs.
254    unsafe {
255        if libc::prlimit(
256            0,
257            libc::RLIMIT_NOFILE,
258            &libc::rlimit {
259                rlim_cur: limit,
260                rlim_max: limit,
261            },
262            std::ptr::null_mut(),
263        ) < 0
264        {
265            return Err(io::Error::last_os_error()).context("failed to update rlimit");
266        }
267    }
268
269    log::info!("running {:?}", &command);
270
271    let child = command.spawn().context("underhill failed to start")?;
272
273    let status = reap_until(child).context("wait failed")?;
274    if status.success() {
275        log::info!("underhill exited successfully");
276    } else {
277        log::error!("underhill terminated unsuccessfully: {}", status);
278    }
279
280    std::process::exit(status.code().unwrap_or(255));
281}
282
283/// Reap zombie processes until `child` exits. Return `child`'s exit status.
284fn reap_until(child: Child) -> io::Result<ExitStatus> {
285    loop {
286        let mut status = 0;
287        // SAFETY: calling according to docs.
288        let pid = unsafe { libc::wait(&mut status) };
289        if pid < 0 {
290            return Err(io::Error::last_os_error());
291        }
292
293        if pid == child.id() as i32 {
294            // The child process died. Pass through the exit status.
295            return Ok(ExitStatus::from_raw(status));
296        }
297    }
298}
299
300fn move_stdio(src: impl Into<std::fs::File>, dst: RawFd) {
301    assert!((0..=2).contains(&dst));
302    let src = src.into();
303    if src.as_raw_fd() != dst {
304        // SAFETY: calling as documented.
305        let r = unsafe { libc::dup2(src.as_raw_fd(), dst) };
306        assert_eq!(r, dst);
307    } else {
308        let _ = src.into_raw_fd();
309    }
310}
311
312fn init_logging() {
313    // Open /dev/null for replacing stdin and stdout.
314    move_stdio(fs_err::File::open("/dev/null").unwrap(), STDIN_FILENO);
315
316    move_stdio(
317        fs_err::OpenOptions::new()
318            .write(true)
319            .open("/dev/null")
320            .unwrap(),
321        STDOUT_FILENO,
322    );
323
324    // Set stderr to /dev/ttyprintk to catch panic stack.
325    let ttyprintk_err = match fs_err::OpenOptions::new()
326        .write(true)
327        .open("/dev/ttyprintk")
328    {
329        Ok(ttyprintk) => {
330            move_stdio(ttyprintk, STDERR_FILENO);
331            None
332        }
333        Err(err) => Some(err),
334    };
335
336    // Set the log output to use /dev/kmsg directly.
337    let syslog = SysLog::new().expect("failed to open /dev/kmsg");
338    log::set_boxed_logger(Box::new(syslog)).expect("no logger already set");
339
340    // TODO: syslog should respect the OPENVMM_LOG env variable to allow runtime
341    // log level changes without rebuilding, but for now downgrade the default
342    // to info to stop noisy logs and allow compile time changes for local
343    // debugging.
344    log::set_max_level(log::LevelFilter::Info);
345
346    // Now that logging is initialized, fail if opening ttyprintk failed.
347    // Otherwise, we probably won't see the failure reason in the logs.
348    if let Some(err) = ttyprintk_err {
349        log::error!("failed to open stderr output: {}", err);
350        panic!();
351    }
352}
353
354fn load_modules(modules_path: &str) -> anyhow::Result<()> {
355    // Get the kernel command line.
356    let cmdline = fs_err::read_to_string("/proc/cmdline")?;
357    let mut params = HashMap::new();
358    for option in cmdline.split_ascii_whitespace() {
359        if let Some((module, option)) = option.split_once('.') {
360            if option.contains('=') {
361                let v: &mut String = params.entry(module.replace('-', "_")).or_default();
362                *v += option;
363                *v += " ";
364            }
365        }
366    }
367
368    // Load the modules.
369    for module in WalkDir::new(modules_path).sort_by_file_name() {
370        let module = module?;
371        if !module.file_type().is_file() {
372            continue;
373        }
374
375        let module = module.path();
376        let module_name = module
377            .file_stem()
378            .unwrap()
379            .to_str()
380            .unwrap()
381            .replace('-', "_");
382
383        let params = params.get_mut(&module_name);
384
385        log::info!(
386            "loading kernel module {}: {}",
387            module.display(),
388            params.as_ref().map_or("", |s| s.as_str())
389        );
390        let file = fs_err::File::open(module).context("failed to open module")?;
391
392        let params = if let Some(params) = params {
393            // Null terminate
394            params.pop();
395            params.push('\0');
396            params.as_bytes()
397        } else {
398            b"\0"
399        };
400
401        // SAFETY: calling the syscall as documented. Of course, the module
402        // being loaded has full kernel privileges, but the contents of the file
403        // system are trusted.
404        let r =
405            unsafe { libc::syscall(libc::SYS_finit_module, file.as_raw_fd(), params.as_ptr(), 0) };
406        if r < 0 {
407            return Err(io::Error::last_os_error())
408                .with_context(|| format!("failed to load module {}", module.display()));
409        }
410
411        log::info!("load complete for {}", module.display());
412    }
413
414    // Once the kernel modules are loaded into memory, the module files are not needed anymore.
415    // By deleting them after, we can save some memory.
416    fs_err::remove_dir_all(modules_path)?;
417
418    Ok(())
419}
420
421fn timestamp() -> u64 {
422    let mut tp;
423    // SAFETY: calling `clock_gettime` as documented.
424    unsafe {
425        tp = std::mem::zeroed();
426        libc::clock_gettime(libc::CLOCK_BOOTTIME, &mut tp);
427    }
428    Duration::new(tp.tv_sec as u64, tp.tv_nsec as u32).as_nanos() as u64
429}
430
/// Runs the whole init sequence: initializes logging, mounts the base
/// filesystems and applies the kernel tunables (via `setup`), runs the
/// configured setup scripts, optionally registers vfio-pci for NVMe devices,
/// starts kernel module loading on a separate thread, and finally launches
/// Underhill via `run`.
///
/// `run` exits the process once Underhill terminates, so this function only
/// ever returns with an error.
fn do_main() -> anyhow::Result<()> {
    // Capture the timestamp before doing any other work; it is handed to
    // Underhill below as `KERNEL_BOOT_TIME`.
    let boot_time = timestamp();

    init_logging();

    log::info!(
        "Initial process: crate_name={}, crate_revision={}, crate_branch={}",
        env!("CARGO_PKG_NAME"),
        option_env!("VERGEN_GIT_SHA").unwrap_or("UNKNOWN_REVISION"),
        option_env!("VERGEN_GIT_BRANCH").unwrap_or("UNKNOWN_BRANCH"),
    );

    // Files whose contents `setup` dumps when trace logging is enabled.
    let stat_files = [
        "/proc/uptime",
        "/proc/timer_list",
        "/proc/interrupts",
        "/proc/meminfo",
        "/proc/iomem",
        "/proc/ioports",
        "/proc/sys/kernel/pid_max",
        "/proc/sys/kernel/threads-max",
        "/proc/sys/vm/max_map_count",
    ];
    let options = Options::parse();
    // `(path, data)` pairs that `setup` writes, in this order, after the
    // filesystems below have been mounted.
    let writes = &[
        // The kernel sets the maximum number of threads to a number
        // inferred from the size of RAM: the thread structures must
        // occupy only 1/8th of the available RAM pages. That is quite
        // small for Underhill in the interactive mode so the kernel
        // would allow only a small number of threads which doesn't
        // let the interactive mode run.
        ("/proc/sys/kernel/threads-max", "32768"),
        // Censor kernel pointers in the logs for security
        ("/proc/sys/kernel/kptr_restrict", "1"),
        // Enable transparent hugepages on requested VMAs. This is used to map
        // VTL0 memory with huge pages. Although this is on by default in our
        // kernel configuration, the kernel turns it off for low-memory systems
        // (which VTL2 is).
        ("/sys/kernel/mm/transparent_hugepage/enabled", "madvise"),
        // Configure the vmbus devices to be handled as user-mode vmbus
        // driver.
        (
            "/sys/bus/vmbus/drivers/uio_hv_generic/new_id",
            // GET
            "8dedd1aa-9056-49e4-bfd6-1bf90dc38ef0",
        ),
        (
            "/sys/bus/vmbus/drivers/uio_hv_generic/new_id",
            // UART
            "8b60ccf6-709f-4c11-90b5-229c959a9e6a",
        ),
        (
            "/sys/bus/vmbus/drivers/uio_hv_generic/new_id",
            // Crashdump
            "427b03e7-4ceb-4286-b5fc-486f4a1dd439",
        ),
        (
            "/proc/sys/kernel/core_pattern",
            if underhill_confidentiality::confidential_filtering_enabled() {
                // Disable the processing of dumps for CVMs.
                ""
            } else {
                // When a user mode crash occurs, the kernel will call `/bin/underhill-crash`
                // passing the information of the crashing process to it.
                // The order of these arguments must match exactly with the order
                // that underhill_crash is expecting.
                "|/bin/underhill-crash %p %i %s %e"
            },
        ),
        // Handle one crashing process at a time.
        ("/proc/sys/kernel/core_pipe_limit", "1"),
        // Don't bother OOM killing processes when out of memory, just panic.
        // Any unexpected process termination is a fatal error anyway, so panic
        // to get a VM crash dump.
        ("/proc/sys/vm/panic_on_oom", "1"),
        // Set the min watermark to 1MiB, the minimum value recommended in
        // Documentation/admin-guide/sysctl/vm.rst (Linux kernel). This controls kswapd.
        // kswapd reclaims memory by swapping or dropping reclaimable caches when the
        // number of free pages in a zone is below the low watermark.
        // VTL2 has no swap and has no reclaimable caches, so there is nothing it can do
        // if it is invoked. By setting the watermarks as low as possible, we
        // ensure that it won't be invoked in normal operation (if it does get invoked, the system
        // is probably about to OOM anyway).
        // This also indirectly controls the size of the percpu pagesets.
        // We want to keep that size as small as possible without introducing contention on the
        // zone lock, as these pages are:
        // * Not counted in MemFree of /proc/meminfo
        // * Not considered when determining if kswapd should be started
        ("/proc/sys/vm/min_free_kbytes", "1024"),
        // Make the high and low watermark as close to the min watermark as possible. This value's
        // units are fractions of 10,000. This means the watermarks will be spaced 0.01% of available
        // memory apart.
        ("/proc/sys/vm/watermark_scale_factor", "1"),
        // Disable the watermark boost feature
        ("/proc/sys/vm/watermark_boost_factor", "0"),
    ];
    // Filesystems mounted by `setup`, in this order (so `/dev` is mounted
    // before `/dev/pts`).
    let filesystems = [
        FilesystemMount::new(
            c"proc",
            c"/proc",
            c"proc",
            libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RELATIME,
            c"",
        ),
        FilesystemMount::new(
            c"sysfs",
            c"/sys",
            c"sysfs",
            libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RELATIME,
            c"",
        ),
        FilesystemMount::new(
            c"dev",
            c"/dev",
            c"devtmpfs",
            libc::MS_NOSUID | libc::MS_NOEXEC | libc::MS_RELATIME,
            c"",
        ),
        FilesystemMount::new(
            c"devpts",
            c"/dev/pts",
            c"devpts",
            libc::MS_NOSUID | libc::MS_NOEXEC | libc::MS_RELATIME,
            c"",
        ),
    ];

    setup(&stat_files, &options, writes, &filesystems)?;
    // Environment variables printed by the setup scripts are forwarded to
    // Underhill, together with the timestamp captured at entry.
    let mut new_env = run_setup_scripts(&options.setup_script)?;
    new_env.push(("KERNEL_BOOT_TIME".into(), boot_time.to_string()));

    if matches!(
        std::env::var("OPENHCL_NVME_VFIO").as_deref(),
        Ok("true" | "1")
    ) {
        // Register VFIO to bind to all NVMe devices, from any vendor.
        //
        // Since nvme is loaded as a module, and that happens after this call,
        // this will take precedence over the in-kernel nvme driver.
        fs_err::write(
            "/sys/bus/pci/drivers/vfio-pci/new_id",
            "ffffffff ffffffff ffffffff ffffffff 010802 ffffff",
        )
        .context("failed to register nvme for vfio")?;
        log::info!("registered vfio-pci as driver for nvme");
    }

    // Start loading modules in parallel.
    let thread = std::thread::spawn(|| {
        if let Err(err) = load_modules("/lib/modules") {
            panic!("failed to load modules: {:#}", err);
        }
    });
    // Only wait for module loading when explicitly requested; otherwise it
    // keeps running on its own thread while Underhill is launched.
    if std::env::var("OPENHCL_WAIT_FOR_MODULES").as_deref() == Ok("1") {
        thread.join().unwrap();
    }

    run(&options, new_env)
}
590
591pub fn main() -> ! {
592    match do_main() {
593        Ok(_) => unreachable!(),
594        Err(err) => {
595            log::error!("fatal: {:#}", err);
596            std::process::exit(1);
597        }
598    }
599}