shadow_rs/host/syscall/handler/
clone.rs

1use linux_api::capability::{LINUX_CAPABILITY_VERSION_3, user_cap_data, user_cap_header};
2use linux_api::errno::Errno;
3use linux_api::posix_types::kernel_pid_t;
4use linux_api::sched::CloneFlags;
5use linux_api::signal::Signal;
6use log::{debug, trace, warn};
7use shadow_shim_helper_rs::explicit_drop::ExplicitDropper;
8use shadow_shim_helper_rs::rootedcell::rc::RootedRc;
9use shadow_shim_helper_rs::rootedcell::refcell::RootedRefCell;
10use shadow_shim_helper_rs::syscall_types::ForeignPtr;
11
12use crate::host::descriptor::descriptor_table::DescriptorTable;
13use crate::host::process::ProcessId;
14use crate::host::thread::Thread;
15
16use super::{SyscallContext, SyscallHandler};
17
18impl SyscallHandler {
19    fn clone_internal(
20        ctx: &mut SyscallContext,
21        flags: CloneFlags,
22        exit_signal: Option<Signal>,
23        child_stack: ForeignPtr<()>,
24        ptid: ForeignPtr<kernel_pid_t>,
25        ctid: ForeignPtr<kernel_pid_t>,
26        newtls: u64,
27    ) -> Result<kernel_pid_t, Errno> {
28        // We use this for a consistency check to validate that we've inspected
29        // and emulated all of the provided flags.
30        let mut handled_flags = CloneFlags::empty();
31
32        // The parameters that we'll pass to the native clone call.
33        let mut native_flags = CloneFlags::empty();
34
35        // We emulate the flags that would use these, so we always pass NULL to
36        // the native call.
37        let native_ctid = ForeignPtr::<kernel_pid_t>::null();
38        let native_ptid = ForeignPtr::<kernel_pid_t>::null();
39
40        // We use the managed-code provided stack.
41        let native_child_stack = child_stack;
42
43        // We use the managed-code provided newtls.
44        let native_newtls = newtls;
45
46        if flags.contains(CloneFlags::CLONE_THREAD) {
47            // From clone(2):
48            // > Since Linux 2.5.35, the flags mask must also include
49            // > CLONE_SIGHAND if CLONE_THREAD is specified
50            if !flags.contains(CloneFlags::CLONE_SIGHAND) {
51                debug!("Missing CLONE_SIGHAND");
52                return Err(Errno::EINVAL);
53            }
54            if !flags.contains(CloneFlags::CLONE_SETTLS) {
55                // Legal in Linux, but the shim will be broken and behave unpredictably.
56                warn!("CLONE_THREAD without CLONE_TLS not supported by shadow");
57                return Err(Errno::ENOTSUP);
58            }
59            if exit_signal.is_some() {
60                warn!("Exit signal is unimplemented");
61                return Err(Errno::ENOTSUP);
62            }
63            // The native clone call will:
64            // - create a thread.
65            native_flags.insert(CloneFlags::CLONE_THREAD);
66            // - share signal handlers (mandatory anyway)
67            native_flags.insert(CloneFlags::CLONE_SIGHAND);
68            // - share file system info (mostly N/A for shadow, but conventional for threads)
69            native_flags.insert(CloneFlags::CLONE_FS);
70            // - share file descriptors
71            native_flags.insert(CloneFlags::CLONE_FILES);
72            // - share semaphores (mostly N/A for shadow, but conventional for threads)
73            native_flags.insert(CloneFlags::CLONE_SYSVSEM);
74
75            handled_flags.insert(CloneFlags::CLONE_THREAD);
76        } else {
77            if ctx.objs.process.memory_borrow().has_mapper() {
78                warn!("Fork with memory mapper unimplemented");
79                return Err(Errno::ENOTSUP);
80            }
81            // Make shadow the parent process
82            native_flags.insert(CloneFlags::CLONE_PARENT);
83        }
84
85        if flags.contains(CloneFlags::CLONE_SIGHAND) {
86            // From clone(2):
87            // > Since Linux 2.6.0, the flags mask must also include CLONE_VM if
88            // > CLONE_SIGHAND is specified
89            if !flags.contains(CloneFlags::CLONE_VM) {
90                debug!("Missing CLONE_VM");
91                return Err(Errno::EINVAL);
92            }
93            // Currently a no-op since threads always share signal handlers,
94            // and we don't yet support non-CLONE_THREAD.
95            handled_flags.insert(CloneFlags::CLONE_SIGHAND);
96        }
97
98        if flags.contains(CloneFlags::CLONE_FS) {
99            // Currently a no-op since we don't support the related
100            // metadata and syscalls that this affects (e.g. chroot).
101            handled_flags.insert(CloneFlags::CLONE_FS);
102        }
103
104        let desc_table = if flags.contains(CloneFlags::CLONE_FILES) {
105            // Child gets a reference to the same table.
106            RootedRc::clone(ctx.objs.thread.descriptor_table(), ctx.objs.host.root())
107        } else {
108            // Child gets a *copy* of the table.
109            let root = ctx.objs.host.root();
110            let table: DescriptorTable = ctx
111                .objs
112                .thread
113                .descriptor_table_borrow(ctx.objs.host)
114                .clone();
115            RootedRc::new(root, RootedRefCell::new(root, table))
116        };
117        let desc_table = ExplicitDropper::new(desc_table, |desc_table| {
118            desc_table.explicit_drop_recursive(ctx.objs.host.root(), ctx.objs.host);
119        });
120        handled_flags.insert(CloneFlags::CLONE_FILES);
121
122        if flags.contains(CloneFlags::CLONE_SETTLS) {
123            native_flags.insert(CloneFlags::CLONE_SETTLS);
124            handled_flags.insert(CloneFlags::CLONE_SETTLS);
125        }
126
127        if flags.contains(CloneFlags::CLONE_VFORK) {
128            // *Typically* `CLONE_VFORK|CLONE_VM` is used as a "faster fork", and
129            // ignoring it will still work as intended.
130            //
131            // In principle this might not be true if the managed program
132            // actually uses the shared memory with the parent process as a
133            // "feature" and e.g. writes to non-scratch memory, expecting the
134            // parent process to see those writes when it resumes.
135            warn_once_then_debug!(
136                "Ignoring CLONE_VFORK (and CLONE_VM if set). In *typical* usage this won't \
137                result in incorrect behavior."
138            );
139            handled_flags.insert(CloneFlags::CLONE_VFORK);
140        }
141
142        if flags.contains(CloneFlags::CLONE_VM) {
143            if flags.contains(CloneFlags::CLONE_THREAD) {
144                native_flags.insert(CloneFlags::CLONE_VM);
145            } else if flags.contains(CloneFlags::CLONE_VFORK) {
146                // We already handled (warned) about this above.
147            } else {
148                // Haven't seen this in practice.
149                //
150                // Unclear that it'd be safe to ignore. Lack of CLONE_VFORK
151                // (which normally pauses the parent until the child exec's or
152                // exits) implies that this that the child may exist for more
153                // than a brief window before exec'ing.
154                warn!("CLONE_VM without CLONE_THREAD and without CLONE_VFORK unsupported");
155                return Err(Errno::ENOTSUP);
156            }
157            handled_flags.insert(CloneFlags::CLONE_VM);
158        }
159
160        if flags.contains(CloneFlags::CLONE_SYSVSEM) {
161            // Currently a no-op since we don't support sysv semaphores.
162            handled_flags.insert(CloneFlags::CLONE_SYSVSEM);
163        }
164
165        // Handled after native clone
166        let do_parent_settid = flags.contains(CloneFlags::CLONE_PARENT_SETTID);
167        handled_flags.insert(CloneFlags::CLONE_PARENT_SETTID);
168
169        // Handled after native clone
170        let do_child_settid = flags.contains(CloneFlags::CLONE_CHILD_SETTID);
171        handled_flags.insert(CloneFlags::CLONE_CHILD_SETTID);
172
173        // Handled after native clone
174        let do_child_cleartid = flags.contains(CloneFlags::CLONE_CHILD_CLEARTID);
175        handled_flags.insert(CloneFlags::CLONE_CHILD_CLEARTID);
176
177        let do_copy_sighandlers = if flags.contains(CloneFlags::CLONE_CLEAR_SIGHAND) {
178            // clone(2): Specifying this flag together with CLONE_SIGHAND is
179            // nonsensical and disallowed.
180            if flags.contains(CloneFlags::CLONE_SIGHAND) {
181                return Err(Errno::EINVAL);
182            }
183            false
184        } else {
185            // We only need to copy if they're not shared.
186            !flags.contains(CloneFlags::CLONE_SIGHAND)
187        };
188        handled_flags.insert(CloneFlags::CLONE_CLEAR_SIGHAND);
189
190        if flags.contains(CloneFlags::CLONE_PARENT) {
191            // Handled in `new_forked_process` when creating a new process.
192            // No-op when not creating a new process.
193            handled_flags.insert(CloneFlags::CLONE_PARENT);
194        }
195
196        let unhandled_flags = flags.difference(handled_flags);
197        if !unhandled_flags.is_empty() {
198            warn!("Unhandled clone flags: {unhandled_flags:?}");
199            return Err(Errno::ENOTSUP);
200        }
201
202        let child_mthread = ctx.objs.thread.mthread().native_clone(
203            ctx.objs,
204            native_flags,
205            native_child_stack,
206            native_ptid,
207            native_ctid,
208            native_newtls,
209        )?;
210
211        let child_tid = ctx.objs.host.get_new_thread_id();
212        let child_pid = if flags.contains(CloneFlags::CLONE_THREAD) {
213            ctx.objs.process.id()
214        } else {
215            ProcessId::from(child_tid)
216        };
217
218        let child_thread = Thread::wrap_mthread(
219            ctx.objs.host,
220            child_mthread,
221            desc_table.into_value(),
222            child_pid,
223            child_tid,
224        )?;
225
226        let childrc = ExplicitDropper::new(
227            RootedRc::new(
228                ctx.objs.host.root(),
229                RootedRefCell::new(ctx.objs.host.root(), child_thread),
230            ),
231            |childrc| {
232                childrc.explicit_drop_recursive(ctx.objs.host.root(), ctx.objs.host);
233            },
234        );
235
236        let child_process_rc;
237        let child_process_borrow;
238        let child_process;
239        if flags.contains(CloneFlags::CLONE_THREAD) {
240            child_process_borrow = None;
241            child_process = ctx.objs.process;
242            ctx.objs
243                .process
244                .add_thread(ctx.objs.host, childrc.into_value());
245        } else {
246            let process = ctx
247                .objs
248                .process
249                .borrow_as_runnable()
250                .unwrap()
251                .new_forked_process(ctx.objs.host, flags, exit_signal, childrc.into_value());
252            child_process_rc = Some(ExplicitDropper::new(
253                process.clone(ctx.objs.host.root()),
254                |x| {
255                    x.explicit_drop_recursive(ctx.objs.host.root(), ctx.objs.host);
256                },
257            ));
258            child_process_borrow = Some(
259                child_process_rc
260                    .as_ref()
261                    .unwrap()
262                    .borrow(ctx.objs.host.root()),
263            );
264            child_process = child_process_borrow.as_ref().unwrap();
265            ctx.objs
266                .host
267                .add_and_schedule_forked_process(ctx.objs.host, process);
268        }
269
270        if do_parent_settid {
271            ctx.objs
272                .process
273                .memory_borrow_mut()
274                .write(ptid, &kernel_pid_t::from(child_tid))?;
275        }
276
277        if do_child_settid {
278            // Set the child thread id in the child's memory.
279            child_process
280                .memory_borrow_mut()
281                .write(ctid, &kernel_pid_t::from(child_tid))?;
282        }
283
284        if do_child_cleartid {
285            let childrc = child_process.thread_borrow(child_tid).unwrap();
286            let child = childrc.borrow(ctx.objs.host.root());
287            child.set_tid_address(ctid);
288        }
289
290        if do_copy_sighandlers {
291            let shmem_lock = ctx.objs.host.shim_shmem_lock_borrow_mut().unwrap();
292
293            let parent_shmem = ctx.objs.process.shmem();
294            let parent_shmem_prot = parent_shmem.protected.borrow(&shmem_lock.root);
295
296            let child_shmem = child_process_borrow.as_ref().unwrap().shmem();
297            let mut child_shmem_prot = child_shmem.protected.borrow_mut(&shmem_lock.root);
298            // Safety: pointers in the parent are valid in the child.
299            unsafe { child_shmem_prot.clone_signal_actions(&parent_shmem_prot) };
300        }
301
302        Ok(kernel_pid_t::from(child_tid))
303    }
304
305    // Note that the syscall args are different than the libc wrapper.
306    // See "C library/kernel differences" in clone(2).
307    log_syscall!(
308        clone,
309        /* rv */ kernel_pid_t,
310        /* flags */ CloneFlags,
311        /* child_stack */ *const std::ffi::c_void,
312        /* ptid */ *const kernel_pid_t,
313        /* ctid */ *const kernel_pid_t,
314        /* newtls */ *const std::ffi::c_void,
315    );
316    pub fn clone(
317        ctx: &mut SyscallContext,
318        flags_and_exit_signal: i32,
319        child_stack: ForeignPtr<()>,
320        ptid: ForeignPtr<kernel_pid_t>,
321        ctid: ForeignPtr<kernel_pid_t>,
322        newtls: u64,
323    ) -> Result<kernel_pid_t, Errno> {
324        let raw_flags = flags_and_exit_signal as u32 & !0xff;
325        let raw_exit_signal = (flags_and_exit_signal as u32 & 0xff) as i32;
326
327        let Some(flags) = CloneFlags::from_bits(raw_flags as u64) else {
328            debug!("Couldn't parse clone flags: {raw_flags:x}");
329            return Err(Errno::EINVAL);
330        };
331
332        let exit_signal = if raw_exit_signal == 0 {
333            None
334        } else {
335            let Ok(exit_signal) = Signal::try_from(raw_exit_signal) else {
336                debug!("Bad exit signal: {raw_exit_signal:?}");
337                return Err(Errno::EINVAL);
338            };
339            Some(exit_signal)
340        };
341
342        Self::clone_internal(ctx, flags, exit_signal, child_stack, ptid, ctid, newtls)
343    }
344
345    log_syscall!(
346        clone3,
347        /* rv */ kernel_pid_t,
348        /* args*/ *const linux_api::sched::clone_args,
349        /* args_size*/ usize,
350    );
351    pub fn clone3(
352        ctx: &mut SyscallContext,
353        args: ForeignPtr<linux_api::sched::clone_args>,
354        args_size: usize,
355    ) -> Result<kernel_pid_t, Errno> {
356        if args_size != std::mem::size_of::<linux_api::sched::clone_args>() {
357            // TODO: allow smaller size, and be careful to only read
358            // as much as the caller specified, and zero-fill the rest.
359            return Err(Errno::EINVAL);
360        }
361        let args = ctx.objs.process.memory_borrow().read(args)?;
362        trace!("clone3 args: {args:?}");
363        let Some(flags) = CloneFlags::from_bits(args.flags) else {
364            debug!("Couldn't parse clone flags: {:x}", args.flags);
365            return Err(Errno::EINVAL);
366        };
367        let exit_signal = if args.exit_signal == 0 {
368            None
369        } else {
370            let Ok(exit_signal) = Signal::try_from(args.exit_signal as i32) else {
371                debug!("Bad signal number: {}", args.exit_signal);
372                return Err(Errno::EINVAL);
373            };
374            Some(exit_signal)
375        };
376        Self::clone_internal(
377            ctx,
378            flags,
379            exit_signal,
380            ForeignPtr::<()>::from(args.stack + args.stack_size),
381            ForeignPtr::<kernel_pid_t>::from_raw_ptr(args.parent_tid as *mut kernel_pid_t),
382            ForeignPtr::<kernel_pid_t>::from_raw_ptr(args.child_tid as *mut kernel_pid_t),
383            args.tls,
384        )
385    }
386
387    log_syscall!(fork, /* rv */ kernel_pid_t);
388    pub fn fork(ctx: &mut SyscallContext) -> Result<kernel_pid_t, Errno> {
389        // This should be the correct call to `clone_internal`, but `clone_internal`
390        // will currently return an error.
391        Self::clone_internal(
392            ctx,
393            CloneFlags::empty(),
394            Some(Signal::SIGCHLD),
395            ForeignPtr::<()>::null(),
396            ForeignPtr::<kernel_pid_t>::null(),
397            ForeignPtr::<kernel_pid_t>::null(),
398            0,
399        )
400    }
401
402    log_syscall!(vfork, /* rv */ kernel_pid_t);
403    pub fn vfork(ctx: &mut SyscallContext) -> Result<kernel_pid_t, Errno> {
404        // This should be the correct call to `clone_internal`, but `clone_internal`
405        // will currently return an error.
406        Self::clone_internal(
407            ctx,
408            CloneFlags::CLONE_VFORK | CloneFlags::CLONE_VM,
409            Some(Signal::SIGCHLD),
410            ForeignPtr::<()>::null(),
411            ForeignPtr::<kernel_pid_t>::null(),
412            ForeignPtr::<kernel_pid_t>::null(),
413            0,
414        )
415    }
416
417    log_syscall!(gettid, /* rv */ kernel_pid_t);
418    pub fn gettid(ctx: &mut SyscallContext) -> Result<kernel_pid_t, Errno> {
419        Ok(kernel_pid_t::from(ctx.objs.thread.id()))
420    }
421
422    log_syscall!(
423        capget,
424        /* rv */ std::ffi::c_int,
425        /* hdrp */ *const std::ffi::c_void,
426        /* datap */ *const std::ffi::c_void,
427    );
428    pub fn capget(
429        ctx: &mut SyscallContext,
430        hdrp: ForeignPtr<user_cap_header>,
431        datap: ForeignPtr<[user_cap_data; 2]>,
432    ) -> Result<(), Errno> {
433        // If the version is not 3, we return the error
434        let hdrp = ctx.objs.process.memory_borrow().read(hdrp)?;
435        if hdrp.version != LINUX_CAPABILITY_VERSION_3 {
436            warn_once_then_debug!(
437                "The version of Linux capabilities is not supported ({})",
438                hdrp.version
439            );
440            return Err(Errno::EINVAL);
441        }
442
443        if !datap.is_null() {
444            // Since we don't provide any capability to the managed plugin, we return zeroes to both
445            // datap[0] and datap[1]
446            let empty = user_cap_data {
447                effective: 0,
448                permitted: 0,
449                inheritable: 0,
450            };
451            ctx.objs
452                .process
453                .memory_borrow_mut()
454                .write(datap, &[empty, empty])?;
455        }
456        Ok(())
457    }
458
459    log_syscall!(
460        capset,
461        /* rv */ std::ffi::c_int,
462        /* hdrp */ *const std::ffi::c_void,
463        /* datap */ *const std::ffi::c_void,
464    );
465    pub fn capset(
466        ctx: &mut SyscallContext,
467        hdrp: ForeignPtr<user_cap_header>,
468        datap: ForeignPtr<[user_cap_data; 2]>,
469    ) -> Result<(), Errno> {
470        // If the version is not 3, we return the error
471        let hdrp = ctx.objs.process.memory_borrow().read(hdrp)?;
472        if hdrp.version != LINUX_CAPABILITY_VERSION_3 {
473            warn_once_then_debug!(
474                "The version of Linux capabilities is not supported ({})",
475                hdrp.version
476            );
477            return Err(Errno::EINVAL);
478        }
479
480        let datap: [_; 2] = ctx.objs.process.memory_borrow().read(datap)?;
481        for data in &datap {
482            // We don't allow the plugin to set any capability
483            if data.effective != 0 || data.permitted != 0 || data.inheritable != 0 {
484                warn_once_then_debug!("Setting Linux capabilities is not supported");
485                return Err(Errno::EINVAL);
486            }
487        }
488        Ok(())
489    }
490}