pal/unix/process/
linux.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Linux process spawning.
5
6use super::Builder;
7use super::Child;
8use super::FdOp;
9use super::SandboxFailureMode;
10use crate::unix::SyscallResult;
11use crate::unix::errno;
12use caps::CapsHashSet;
13use landlock::RulesetCreated;
14use seccompiler::SeccompFilter;
15use std::ffi::CStr;
16use std::ffi::CString;
17use std::io;
18use std::os::unix::prelude::*;
19
20struct CloneContext<'a> {
21    executable: &'a CStr,
22    argv: &'a [*const libc::c_char],
23    envp: &'a [*const libc::c_char],
24    result: Option<i32>,
25    // TODO: refactor this to contain BorrowedFds
26    fd_ops: &'a mut [(i32, FdOp)],
27    sandbox_failure_mode: SandboxFailureMode,
28    setsid: bool,
29    controlling_terminal: Option<BorrowedFd<'a>>,
30    uid: Option<libc::uid_t>,
31    gid: Option<libc::uid_t>,
32    permitted_capabilities: Option<CapsHashSet>,
33    effective_capabilities: Option<CapsHashSet>,
34    ambient_capabilities: Option<CapsHashSet>,
35    inheritable_capabilities: Option<CapsHashSet>,
36    bounding_capabilities: Option<CapsHashSet>,
37    landlock_rules: Option<RulesetCreated>,
38    seccomp_filter: Option<SeccompFilter>,
39}
40
41impl Builder<'_> {
42    pub(super) fn spawn_internal(
43        &self,
44        envp: &[CString],
45        fd_ops: &mut [(i32, FdOp)],
46    ) -> io::Result<Child> {
47        let mut landlock_rules = None;
48        if let Some(lr) = &self.linux_builder.landlock_rules {
49            landlock_rules = Some(lr.try_clone()?);
50        }
51
52        // Build the null-terminated arrays for exec.
53        let argv = super::c_slice_to_pointers(&self.argv);
54        let envp = super::c_slice_to_pointers(envp);
55
56        let mut context = CloneContext {
57            executable: &self.executable,
58            argv: &argv,
59            envp: &envp,
60            result: None,
61            fd_ops: &mut *fd_ops,
62            sandbox_failure_mode: self.linux_builder.sandbox_failure_mode,
63            setsid: self.linux_builder.setsid,
64            controlling_terminal: self.linux_builder.controlling_terminal,
65            uid: self.uid,
66            gid: self.gid,
67            permitted_capabilities: self.linux_builder.permitted_capabilities.clone(),
68            effective_capabilities: self.linux_builder.effective_capabilities.clone(),
69            inheritable_capabilities: self.linux_builder.inheritable_capabilities.clone(),
70            ambient_capabilities: self.linux_builder.ambient_capabilities.clone(),
71            bounding_capabilities: self.linux_builder.bounding_capabilities.clone(),
72            landlock_rules,
73            seccomp_filter: self.linux_builder.seccomp_filter.clone(),
74        };
75
76        // Use CLONE_VM and CLONE_VFORK so that the new process will share the
77        // current address space and will block this thread until it either
78        // exits or calls exec.
79        //
80        // Use CLONE_PIDFD to get an fd back to use for polling.
81        let mut flags = self.linux_builder.clone_flags | libc::CLONE_PIDFD | libc::SIGCHLD;
82
83        if self.linux_builder.vfork {
84            flags |= libc::CLONE_VM | libc::CLONE_VFORK;
85        }
86
87        // SAFETY: sysconf has no safety requirements.
88        let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) } as usize;
89
90        // Common page sizes are 4KiB, 16KiB, and 64KiB. The stack size must be a multiple
91        // of the page size.
92        let stack_len: usize = std::cmp::max(16 * 1024, page_size);
93        assert!(stack_len % page_size == 0);
94
95        // Create a stack with one guard page.
96        let stack_len = stack_len + page_size;
97        // SAFETY: creating a new mapping, which has no safety requirements.
98        let stack = unsafe {
99            libc::mmap(
100                std::ptr::null_mut(),
101                stack_len,
102                libc::PROT_READ | libc::PROT_WRITE,
103                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
104                -1,
105                0,
106            )
107        };
108        if stack == libc::MAP_FAILED {
109            return Err(errno().into());
110        }
111        let mmap_guard = ChildStackGuard(stack, stack_len);
112        // SAFETY: The stack has been checked to be valid, and its length is more than one page.
113        unsafe { libc::mprotect(stack, page_size, libc::PROT_NONE) }.syscall_result()?;
114        let mut pidfd: libc::pid_t = -1;
115
116        // SAFETY: The stack is valid for stack len, if the child goes off the
117        // stack they'll hit our guard page, the flags include PIDFD so passing
118        // pidfd is valid, and clone_cb takes a CloneContext pointer as its only
119        // argument.
120        let pid = unsafe {
121            libc::clone(
122                clone_cb,
123                stack.add(stack_len),
124                flags,
125                std::ptr::from_mut(&mut context).cast(),
126                &mut pidfd,
127            )
128        }
129        .syscall_result()?;
130        drop(mmap_guard);
131
132        // SAFETY: We set the PIDFD flag, and clone returned successfully, so pidfd is now valid.
133        let pidfd = unsafe { OwnedFd::from_raw_fd(pidfd) };
134        let mut child = Child {
135            pid,
136            pidfd,
137            status: None,
138        };
139
140        // This can only be done if we are vforking, without sharing another
141        // type of status object we can't determine if the execve failed or
142        // the process failed during early initialization.
143        if self.linux_builder.vfork && context.result != Some(0) {
144            // The new process failed without successfully calling execve. Reap
145            // it and return the associated error code (which may come from
146            // context or from the exit code).
147            let status = child.wait().unwrap();
148            let ec = context.result.unwrap_or_else(|| {
149                status
150                    .code()
151                    .expect("child should not have failed with a signal")
152            });
153            return Err(io::Error::from_raw_os_error(ec));
154        }
155
156        Ok(child)
157    }
158}
159
160struct ChildStackGuard(*mut libc::c_void, usize);
161
162impl Drop for ChildStackGuard {
163    fn drop(&mut self) {
164        // SAFETY: We know the pointer is valid and the length is correct at
165        // construction, and we know the child is not running anymore, so it's
166        // safe to unmap the stack.
167        unsafe { libc::munmap(self.0, self.1) }
168            .syscall_result()
169            .unwrap();
170    }
171}
172
173/// Runs in the cloned process to set up the process environment and exec the
174/// new binary.
175///
176/// This function must not use the heap or call any functions that might. It
177/// also has only a small amount of stack space available. It should avoid using
178/// OS functionality via the std crate and should use libc directly.
179///
180/// Returns the exit code for the new process. If this function does not update
181/// context's result, then the exit code will be the Linux errno value
182/// associated with the error.
183//
184// N.B. this should be unsafe but the libc crate neglected to mark the clone
185// callback appropriately.
186extern "C" fn clone_cb(context: *mut libc::c_void) -> libc::c_int {
187    // SAFETY: Context is temporarily owned by this function, and we know
188    // we were passed a valid pointer.
189    let context = unsafe { &mut *(context.cast::<CloneContext<'_>>()) };
190
191    if context.setsid {
192        // SAFETY: setsid has no safety requirements.
193        if unsafe { libc::setsid() } < 0 {
194            return errno().0;
195        }
196    }
197
198    if let Some(fd) = context.controlling_terminal {
199        // SAFETY: fd is guaranteed to be valid.
200        if unsafe { libc::ioctl(fd.as_raw_fd(), libc::TIOCSCTTY, 0) } < 0 {
201            return errno().0;
202        }
203    }
204
205    // Find the maximum newfd, needed below.
206    let maxfd = context.fd_ops.iter().map(|(fd, _)| *fd).max();
207
208    if let Some(maxfd) = maxfd {
209        for (newfd, op) in &mut *context.fd_ops {
210            match op {
211                FdOp::Close => {}
212                FdOp::Dup(oldfd) => {
213                    // Ensure oldfd is above the maximum newfd. This is
214                    // necessary to ensure that another operation does not close
215                    // an oldfd targeted by this operation.
216                    if oldfd != newfd && *oldfd < maxfd {
217                        // SAFETY: fd is guaranteed to be valid
218                        let new_oldfd =
219                            unsafe { libc::fcntl(*oldfd, libc::F_DUPFD_CLOEXEC, maxfd) };
220                        if new_oldfd < 0 {
221                            return errno().0;
222                        }
223                        *oldfd = new_oldfd;
224                    }
225                }
226            }
227        }
228
229        for (newfd, op) in &*context.fd_ops {
230            match op {
231                FdOp::Close => {
232                    // SAFETY: fd is guaranteed to be valid
233                    if unsafe { libc::close(*newfd) } < 0 {
234                        return errno().0;
235                    }
236                }
237                FdOp::Dup(oldfd) => {
238                    if *newfd == *oldfd {
239                        // SAFETY: fd is guaranteed to be valid
240                        if unsafe {
241                            libc::fcntl(
242                                *oldfd,
243                                libc::F_SETFD,
244                                libc::fcntl(*oldfd, libc::F_GETFD) & !libc::FD_CLOEXEC,
245                            )
246                        } < 0
247                        {
248                            return errno().0;
249                        }
250                    } else {
251                        // SAFETY: fds are guaranteed to be valid
252                        if unsafe { libc::dup2(*oldfd, *newfd) } < 0 {
253                            return errno().0;
254                        }
255                    }
256                }
257            }
258        }
259    }
260
261    macro_rules! handle_sandbox_failure {
262        ($m:expr, $r:expr) => {
263            match context.sandbox_failure_mode {
264                SandboxFailureMode::Silent => {}
265                SandboxFailureMode::Warn => {
266                    tracing::warn!($m);
267                }
268                SandboxFailureMode::Error => {
269                    tracing::error!($m);
270                    return $r;
271                }
272            }
273        };
274    }
275
276    if let Some(landlock_rules) = context.landlock_rules.take() {
277        if landlock_rules.restrict_self().is_err() {
278            handle_sandbox_failure!("failed to apply landlock ruleset", libc::ENOTSUP);
279        }
280    }
281
282    if let Some(gid) = context.gid {
283        // SAFETY: setresgid has no safety requirements.
284        if unsafe { libc::setresgid(gid, gid, gid) } < 0 {
285            handle_sandbox_failure!("failed to change group id", libc::ENOTSUP);
286        }
287    }
288
289    if let Some(uid) = context.uid {
290        // SAFETY: setresuid has no safety requirements.
291        if unsafe { libc::setresuid(uid, uid, uid) } < 0 {
292            handle_sandbox_failure!("failed to change user id", libc::ENOTSUP);
293        }
294    }
295
296    macro_rules! set_capabilities {
297        ($t:expr, $v:ident) => {
298            if let Some($v) = &context.$v {
299                if caps::set(None, $t, &$v).is_err() {
300                    handle_sandbox_failure!(
301                        std::concat!("failed to apply ", stringify!($t), " capabilities"),
302                        libc::ENOTSUP
303                    );
304                }
305            }
306        };
307    }
308
309    set_capabilities!(caps::CapSet::Bounding, bounding_capabilities);
310    set_capabilities!(caps::CapSet::Permitted, permitted_capabilities);
311    set_capabilities!(caps::CapSet::Ambient, ambient_capabilities);
312    set_capabilities!(caps::CapSet::Inheritable, inheritable_capabilities);
313    set_capabilities!(caps::CapSet::Effective, effective_capabilities);
314
315    if let Some(seccomp_filter) = context.seccomp_filter.take() {
316        if let Ok(bpf_program) = TryInto::<seccompiler::BpfProgram>::try_into(seccomp_filter) {
317            if seccompiler::apply_filter(&bpf_program).is_err() {
318                handle_sandbox_failure!("failed to apply seccomp profile", libc::ENOTSUP);
319            }
320        }
321    }
322
323    // Update the result indicating success in case execvpe does not return.
324    context.result = Some(0);
325    // N.B. This will only return on error.
326    // SAFETY: Arguments in the context are valid CStrings, and the two arrays
327    // are properly null terminated.
328    unsafe {
329        libc::execvpe(
330            context.executable.as_ptr(),
331            context.argv.as_ptr(),
332            context.envp.as_ptr(),
333        )
334    };
335    // Update the result with the failure code.
336    context.result = Some(errno().0);
337    255
338}
339
340impl AsFd for Child {
341    fn as_fd(&self) -> BorrowedFd<'_> {
342        self.pidfd.as_fd()
343    }
344}