shadow_rs/host/
managed_thread.rs

1//! A thread of a managed process.
2//!
3//! This contains the code where the simulator can create or communicate with a managed process.
4
5use std::cell::{Cell, RefCell};
6use std::ffi::{CStr, CString};
7use std::io::Write;
8use std::os::fd::AsRawFd;
9use std::os::unix::prelude::OsStrExt;
10use std::path::PathBuf;
11use std::sync::{Arc, atomic};
12
13use linux_api::errno::Errno;
14use linux_api::posix_types::Pid;
15use linux_api::sched::CloneFlags;
16use linux_api::signal::tgkill;
17use log::{Level, debug, error, log_enabled, trace};
18use rand::Rng as _;
19use rustix::pipe::PipeFlags;
20use rustix::process::WaitOptions;
21use shadow_shim_helper_rs::ipc::IPCData;
22use shadow_shim_helper_rs::shim_event::{
23    ShimEventAddThreadReq, ShimEventAddThreadRes, ShimEventStartRes, ShimEventSyscall,
24    ShimEventSyscallComplete, ShimEventToShadow, ShimEventToShim,
25};
26use shadow_shim_helper_rs::syscall_types::{ForeignPtr, SyscallArgs, SyscallReg};
27use shadow_shmem::allocator::ShMemBlock;
28use vasi_sync::scchannel::SelfContainedChannelError;
29
30use super::context::ThreadContext;
31use super::host::Host;
32use super::syscall::condition::SyscallCondition;
33use crate::core::worker::{WORKER_SHARED, Worker};
34use crate::cshadow;
35use crate::host::syscall::handler::SyscallHandler;
36use crate::host::syscall::types::{ForeignArrayPtr, SyscallReturn};
37use crate::utility::{VerifyPluginPathError, inject_preloads, syscall, verify_plugin_path};
38
39/// The ManagedThread's state after having been allowed to execute some code.
40#[derive(Debug)]
41#[must_use]
42pub enum ResumeResult {
43    /// Blocked on a SyscallCondition.
44    Blocked(SyscallCondition),
45    /// The native thread has exited with the given code.
46    ExitedThread(i32),
47    /// The thread's process has exited.
48    ExitedProcess,
49}
50
51pub struct ManagedThread {
52    ipc_shmem: Arc<ShMemBlock<'static, IPCData>>,
53    is_running: Cell<bool>,
54    return_code: Cell<Option<i32>>,
55
56    /* holds the event for the most recent call from the plugin/shim */
57    current_event: RefCell<ShimEventToShadow>,
58
59    native_pid: linux_api::posix_types::Pid,
60    native_tid: linux_api::posix_types::Pid,
61
62    // Value storing the current CPU affinity of the thread (more precisely,
63    // of the native thread backing this thread object). This value will be set
64    // to AFFINITY_UNINIT if CPU pinning is not enabled or if the thread has
65    // not yet been pinned to a CPU.
66    affinity: Cell<i32>,
67}
68
69impl ManagedThread {
70    pub fn native_pid(&self) -> linux_api::posix_types::Pid {
71        self.native_pid
72    }
73
74    pub fn native_tid(&self) -> linux_api::posix_types::Pid {
75        self.native_tid
76    }
77
78    /// Make the specified syscall on the native thread.
79    ///
80    /// Panics if the native thread is dead or dies during the syscall,
81    /// including if the syscall itself is SYS_exit or SYS_exit_group.
82    pub fn native_syscall(&self, ctx: &ThreadContext, n: i64, args: &[SyscallReg]) -> SyscallReg {
83        let mut syscall_args = SyscallArgs {
84            number: n,
85            args: [SyscallReg::from(0u64); 6],
86        };
87        syscall_args.args[..args.len()].copy_from_slice(args);
88        match self.continue_plugin(
89            ctx.host,
90            &ShimEventToShim::Syscall(ShimEventSyscall { syscall_args }),
91        ) {
92            ShimEventToShadow::SyscallComplete(res) => res.retval,
93            other => panic!("Unexpected response from plugin: {other:?}"),
94        }
95    }
96
97    pub fn spawn(
98        plugin_path: &CStr,
99        argv: Vec<CString>,
100        envv: Vec<CString>,
101        strace_file: Option<&std::fs::File>,
102        log_file: &std::fs::File,
103        injected_preloads: &[PathBuf],
104    ) -> Result<Self, Errno> {
105        debug!(
106            "spawning new mthread '{plugin_path:?}' with environment '{envv:?}', arguments '{argv:?}'"
107        );
108
109        let envv = inject_preloads(envv, injected_preloads);
110
111        debug!("env after preload injection: {envv:?}");
112
113        let ipc_shmem = Arc::new(shadow_shmem::allocator::shmalloc(IPCData::new()));
114
115        let child_pid =
116            Self::spawn_native(plugin_path, argv, envv, strace_file, log_file, &ipc_shmem)?;
117
118        // In Linux, the PID is equal to the TID of its first thread.
119        let native_pid = child_pid;
120        let native_tid = child_pid;
121
122        // Configure the child_pid_watcher to close the IPC channel when the child dies.
123        {
124            let worker = WORKER_SHARED.borrow();
125            let watcher = worker.as_ref().unwrap().child_pid_watcher();
126
127            watcher.register_pid(child_pid);
128            let ipc = ipc_shmem.clone();
129            watcher.register_callback(child_pid, move |_pid| {
130                ipc.from_plugin().close_writer();
131            })
132        };
133
134        trace!(
135            "waiting for start event from shim with native pid {:?}",
136            native_pid
137        );
138        let start_req = ipc_shmem.from_plugin().receive().unwrap();
139        match &start_req {
140            ShimEventToShadow::StartReq(_) => {
141                // Expected result; shim is ready to initialize.
142            }
143            ShimEventToShadow::ProcessDeath => {
144                // The process died before initializing the shim.
145                //
146                // Reap the dead process and return an error.
147                let status =
148                    rustix::process::waitpid(Some(native_pid.into()), WaitOptions::empty())
149                        .unwrap()
150                        .unwrap();
151                if status.exit_status() == Some(127) {
152                    // posix_spawn(3):
153                    // > If  the child  fails  in  any  of the
154                    // > housekeeping steps described below, or fails to
155                    // > execute the desired file, it exits with a status of
156                    // > 127.
157                    debug!("posix_spawn failed to exec the process");
158                    // Assume that execve failed, and return a plausible reason
159                    // why it might have done so.
160                    // TODO: replace our usage of posix_spawn with a custom
161                    // implementation that can return the execve failure code?
162                    return Err(Errno::EPERM);
163                }
164                // TODO: handle more gracefully.
165                // * The native stdout/stderr might have a clue as to
166                // why the process died.  Consider logging a hint to
167                // check it (currently in the corresponding shimlog), or
168                // directly capture it and display it here.
169                // https://github.com/shadow/shadow/issues/3142
170                // * Consider logging a warning here and continuing on to handle
171                // the managed process exit normally. e.g. when this happens
172                // as part of an emulated `execve`, we might want to continue
173                // the simulation.
174                panic!("Child process died unexpectedly before initialization: {status:?}");
175            }
176            other => panic!("Unexpected result from shim: {other:?}"),
177        };
178
179        Ok(Self {
180            ipc_shmem,
181            is_running: Cell::new(true),
182            return_code: Cell::new(None),
183            current_event: RefCell::new(start_req),
184            native_pid,
185            native_tid,
186            affinity: Cell::new(cshadow::AFFINITY_UNINIT),
187        })
188    }
189
190    pub fn resume(
191        &self,
192        ctx: &ThreadContext,
193        syscall_handler: &mut SyscallHandler,
194    ) -> ResumeResult {
195        debug_assert!(self.is_running());
196
197        self.sync_affinity_with_worker();
198
199        // Flush any pending writes, e.g. from a previous mthread that exited
200        // without flushing.
201        ctx.process.free_unsafe_borrows_flush().unwrap();
202
203        loop {
204            let mut current_event = self.current_event.borrow_mut();
205            let last_event = *current_event;
206            *current_event = match last_event {
207                ShimEventToShadow::StartReq(start_req) => {
208                    // Write the serialized thread shmem handle directly to shim
209                    // memory.
210                    ctx.process
211                        .memory_borrow_mut()
212                        .write(
213                            start_req.thread_shmem_block_to_init,
214                            &ctx.thread.shmem().serialize(),
215                        )
216                        .unwrap();
217
218                    if !start_req.process_shmem_block_to_init.is_null() {
219                        // Write the serialized process shmem handle directly to
220                        // shim memory.
221                        ctx.process
222                            .memory_borrow_mut()
223                            .write(
224                                start_req.process_shmem_block_to_init,
225                                &ctx.process.shmem().serialize(),
226                            )
227                            .unwrap();
228                    }
229
230                    if !start_req.initial_working_dir_to_init.is_null() {
231                        // Write the working dir.
232                        let mut mem = ctx.process.memory_borrow_mut();
233                        let mut writer = mem.writer(ForeignArrayPtr::new(
234                            start_req.initial_working_dir_to_init,
235                            start_req.initial_working_dir_to_init_len,
236                        ));
237                        writer
238                            .write_all(ctx.process.current_working_dir().to_bytes_with_nul())
239                            .unwrap();
240                        writer.flush().unwrap();
241                    }
242
243                    // send the message to the shim to call main().
244                    trace!("sending start event code to shim");
245                    self.continue_plugin(
246                        ctx.host,
247                        &ShimEventToShim::StartRes(ShimEventStartRes {
248                            auxvec_random: ctx.host.random_mut().random(),
249                        }),
250                    )
251                }
252                ShimEventToShadow::ProcessDeath => {
253                    // The native threads are all dead or zombies. Nothing to do but
254                    // clean up.
255                    self.cleanup_after_exit_initiated();
256                    return ResumeResult::ExitedProcess;
257                }
258                ShimEventToShadow::Syscall(syscall) => {
259                    // Emulate the given syscall.
260
261                    // `exit` is tricky since it only exits the *mthread*, and we don't have a way
262                    // to be notified that the mthread has exited. We have to "fire and forget"
263                    // the command to execute the syscall natively.
264                    //
265                    // TODO: We could use a tid futex in shared memory, as set by
266                    // `set_tid_address`, to block here until the thread has
267                    // actually exited.
268                    if syscall.syscall_args.number == libc::SYS_exit {
269                        let return_code = syscall.syscall_args.args[0].into();
270                        debug!("Short-circuiting syscall exit({return_code})");
271                        self.return_code.set(Some(return_code));
272                        // Tell mthread to go ahead and make the exit syscall itself.
273                        // We *don't* call `_managedthread_continuePlugin` here,
274                        // since that'd release the ShimSharedMemHostLock, and we
275                        // aren't going to get a message back to know when it'd be
276                        // safe to take it again.
277                        self.ipc_shmem
278                            .to_plugin()
279                            .send(ShimEventToShim::SyscallDoNative);
280                        self.cleanup_after_exit_initiated();
281                        return ResumeResult::ExitedThread(return_code);
282                    }
283
284                    let scr = syscall_handler.syscall(ctx, &syscall.syscall_args).into();
285
286                    // remove the mthread's old syscall condition since it's no longer needed
287                    ctx.thread.cleanup_syscall_condition();
288
289                    assert!(self.is_running());
290
291                    // Flush any writes that legacy C syscallhandlers may have
292                    // made.
293                    ctx.process.free_unsafe_borrows_flush().unwrap();
294
295                    match scr {
296                        SyscallReturn::Block(b) => {
297                            return ResumeResult::Blocked(unsafe {
298                                SyscallCondition::consume_from_c(b.cond)
299                            });
300                        }
301                        SyscallReturn::Done(d) => self.continue_plugin(
302                            ctx.host,
303                            &ShimEventToShim::SyscallComplete(ShimEventSyscallComplete {
304                                retval: d.retval,
305                                restartable: d.restartable,
306                            }),
307                        ),
308                        SyscallReturn::Native => {
309                            self.continue_plugin(ctx.host, &ShimEventToShim::SyscallDoNative)
310                        }
311                    }
312                }
313                ShimEventToShadow::AddThreadRes(res) => {
314                    // We get here in the child process after forking.
315
316                    // Child should have gotten 0 back from its native clone syscall.
317                    assert_eq!(res.clone_res, 0);
318
319                    // Complete the virtualized clone syscall.
320                    self.continue_plugin(
321                        ctx.host,
322                        &ShimEventToShim::SyscallComplete(ShimEventSyscallComplete {
323                            retval: 0.into(),
324                            restartable: false,
325                        }),
326                    )
327                }
328                e @ ShimEventToShadow::SyscallComplete(_) => panic!("Unexpected event: {e:?}"),
329            };
330            assert!(self.is_running());
331        }
332    }
333
334    pub fn handle_process_exit(&self) {
335        // TODO: Only do this once per process; maybe by moving into `Process`.
336        WORKER_SHARED
337            .borrow()
338            .as_ref()
339            .unwrap()
340            .child_pid_watcher()
341            .unregister_pid(self.native_pid());
342
343        self.cleanup_after_exit_initiated();
344    }
345
346    pub fn return_code(&self) -> Option<i32> {
347        self.return_code.get()
348    }
349
350    pub fn is_running(&self) -> bool {
351        self.is_running.get()
352    }
353
354    /// Execute the specified `clone` syscall in `self`, and use create a new
355    /// `ManagedThread` object to manage it. The new thread will be managed
356    /// by Shadow, and suitable for use with `Thread::wrap_mthread`.
357    ///
358    /// If the `clone` syscall fails, the native error is returned.
359    pub fn native_clone(
360        &self,
361        ctx: &ThreadContext,
362        flags: CloneFlags,
363        child_stack: ForeignPtr<()>,
364        ptid: ForeignPtr<libc::pid_t>,
365        ctid: ForeignPtr<libc::pid_t>,
366        newtls: libc::c_ulong,
367    ) -> Result<ManagedThread, linux_api::errno::Errno> {
368        let child_ipc_shmem = Arc::new(shadow_shmem::allocator::shmalloc(IPCData::new()));
369
370        // Send the IPC block for the new mthread to use.
371        let clone_res: i64 = match self.continue_plugin(
372            ctx.host,
373            &ShimEventToShim::AddThreadReq(ShimEventAddThreadReq {
374                ipc_block: child_ipc_shmem.serialize(),
375                flags: flags.bits(),
376                child_stack,
377                ptid: ptid.cast::<()>(),
378                ctid: ctid.cast::<()>(),
379                newtls,
380            }),
381        ) {
382            ShimEventToShadow::AddThreadRes(ShimEventAddThreadRes { clone_res }) => clone_res,
383            r => panic!("Unexpected result: {r:?}"),
384        };
385        let clone_res: SyscallReg = syscall::raw_return_value_to_result(clone_res)?;
386        let child_native_tid = Pid::from_raw(libc::pid_t::from(clone_res)).unwrap();
387        trace!("native clone treated tid {child_native_tid:?}");
388
389        trace!(
390            "waiting for start event from shim with native tid {:?}",
391            child_native_tid
392        );
393        let start_req = child_ipc_shmem.from_plugin().receive().unwrap();
394        match &start_req {
395            ShimEventToShadow::StartReq(_) => (),
396            other => panic!("Unexpected result from shim: {other:?}"),
397        };
398
399        let native_pid = if flags.contains(CloneFlags::CLONE_THREAD) {
400            self.native_pid
401        } else {
402            child_native_tid
403        };
404
405        if !flags.contains(CloneFlags::CLONE_THREAD) {
406            // Child is a new process; register it.
407            WORKER_SHARED
408                .borrow()
409                .as_ref()
410                .unwrap()
411                .child_pid_watcher()
412                .register_pid(native_pid);
413        }
414
415        // Register the child thread's IPC block with the ChildPidWatcher.
416        {
417            let child_ipc_shmem = child_ipc_shmem.clone();
418            WORKER_SHARED
419                .borrow()
420                .as_ref()
421                .unwrap()
422                .child_pid_watcher()
423                .register_callback(native_pid, move |_pid| {
424                    child_ipc_shmem.from_plugin().close_writer();
425                })
426        };
427
428        Ok(Self {
429            ipc_shmem: child_ipc_shmem,
430            is_running: Cell::new(true),
431            return_code: Cell::new(None),
432            current_event: RefCell::new(start_req),
433            native_pid,
434            native_tid: child_native_tid,
435            // TODO: can we assume it's inherited from the current thread affinity?
436            affinity: Cell::new(cshadow::AFFINITY_UNINIT),
437        })
438    }
439
440    #[must_use]
441    fn continue_plugin(&self, host: &Host, event: &ShimEventToShim) -> ShimEventToShadow {
442        // Update shared state before transferring control.
443        host.shim_shmem_lock_borrow_mut().unwrap().max_runahead_time =
444            Worker::max_event_runahead_time(host);
445        host.shim_shmem()
446            .sim_time
447            .store(Worker::current_time().unwrap(), atomic::Ordering::Relaxed);
448
449        // Release lock so that plugin can take it. Reacquired in `wait_for_next_event`.
450        host.unlock_shmem();
451
452        self.ipc_shmem.to_plugin().send(*event);
453
454        let event = match self.ipc_shmem.from_plugin().receive() {
455            Ok(e) => e,
456            Err(SelfContainedChannelError::WriterIsClosed) => ShimEventToShadow::ProcessDeath,
457        };
458
459        // Reacquire the shared memory lock, now that the shim has yielded control
460        // back to us.
461        host.lock_shmem();
462
463        // Update time, which may have been incremented in the shim.
464        let shim_time = host.shim_shmem().sim_time.load(atomic::Ordering::Relaxed);
465        if log_enabled!(Level::Trace) {
466            let worker_time = Worker::current_time().unwrap();
467            if shim_time != worker_time {
468                trace!(
469                    "Updating time from {worker_time:?} to {shim_time:?} (+{:?})",
470                    shim_time - worker_time
471                );
472            }
473        }
474        Worker::set_current_time(shim_time);
475
476        event
477    }
478
479    /// To be called after we expect the native thread to have exited, or to
480    /// exit imminently.
481    fn cleanup_after_exit_initiated(&self) {
482        if !self.is_running.get() {
483            return;
484        }
485        self.wait_for_native_exit();
486        trace!("child {:?} exited", self.native_tid());
487        self.is_running.set(false);
488    }
489
490    /// Wait until the managed thread is no longer running.
491    fn wait_for_native_exit(&self) {
492        let native_pid = self.native_pid();
493        let native_tid = self.native_tid();
494
495        // We use `tgkill` and `/proc/x/stat` to detect whether the thread is still running,
496        // looping until it doesn't.
497        //
498        // Alternatively we could use `set_tid_address` or `set_robust_list` to
499        // be notified on a futex. Those are a bit underdocumented and fragile,
500        // though. In practice this shouldn't have to loop significantly.
501        trace!("Waiting for native thread {native_pid:?}.{native_tid:?} to exit");
502        loop {
503            if self.ipc_shmem.from_plugin().writer_is_closed() {
504                // This indicates that the whole process has stopped executing;
505                // no need to poll the individual thread.
506                break;
507            }
508            match tgkill(native_pid, native_tid, None) {
509                Err(Errno::ESRCH) => {
510                    trace!("Thread is done exiting; proceeding with cleanup");
511                    break;
512                }
513                Err(e) => {
514                    error!("Unexpected tgkill error: {:?}", e);
515                    break;
516                }
517                Ok(()) if native_pid == native_tid => {
518                    // Thread leader could be in a zombie state waiting for
519                    // the other threads to exit.
520                    let filename = format!("/proc/{}/stat", native_pid.as_raw_nonzero().get());
521                    let stat = match std::fs::read_to_string(filename) {
522                        Err(e) => {
523                            assert!(e.kind() == std::io::ErrorKind::NotFound);
524                            trace!("tgl {native_pid:?} is fully dead");
525                            break;
526                        }
527                        Ok(s) => s,
528                    };
529                    if stat.contains(") Z") {
530                        trace!("tgl {native_pid:?} is a zombie");
531                        break;
532                    }
533                    // Still alive and in a non-zombie state; continue
534                }
535                Ok(()) => {
536                    // Thread is still alive; continue.
537                }
538            };
539            std::thread::yield_now();
540        }
541    }
542
543    fn sync_affinity_with_worker(&self) {
544        let current_affinity = scheduler::core_affinity()
545            .map(|x| i32::try_from(x).unwrap())
546            .unwrap_or(cshadow::AFFINITY_UNINIT);
547        self.affinity.set(unsafe {
548            cshadow::affinity_setProcessAffinity(
549                self.native_tid().as_raw_nonzero().get(),
550                current_affinity,
551                self.affinity.get(),
552            )
553        });
554    }
555
556    fn spawn_native(
557        plugin_path: &CStr,
558        argv: Vec<CString>,
559        envv: Vec<CString>,
560        strace_file: Option<&std::fs::File>,
561        shimlog_file: &std::fs::File,
562        shmem_block: &ShMemBlock<IPCData>,
563    ) -> Result<Pid, Errno> {
564        // Preemptively check for likely reasons that execve might fail.
565        // In particular we want to ensure that we  don't launch a statically
566        // linked executable, since we'd then deadlock the whole simulation
567        // waiting for the plugin to initialize.
568        //
569        // This is also helpful since we can't retrieve specific `execve` errors
570        // through `posix_spawn`.
571        fn map_verify_err(e: VerifyPluginPathError) -> Errno {
572            match e {
573                // execve(2): ENOENT The file pathname [...] does not exist.
574                VerifyPluginPathError::NotFound => Errno::ENOENT,
575                // execve(2): EACCES The file or a script interpreter is not a regular file.
576                VerifyPluginPathError::NotFile => Errno::EACCES,
577                // execve(2): EACCES Execute permission is denied for the file or a script or ELF interpreter.
578                VerifyPluginPathError::NotExecutable => Errno::EACCES,
579                // execve(2): ENOEXEC An executable is not in a recognized
580                // format, is for the wrong architecture, or has some other
581                // format error that means it cannot be executed.
582                VerifyPluginPathError::UnknownFileType => Errno::ENOEXEC,
583                VerifyPluginPathError::NotDynamicallyLinkedElf => Errno::ENOEXEC,
584                VerifyPluginPathError::IncompatibleInterpreter(e) => map_verify_err(*e),
585                // execve(2): EACCES Search permission is denied on a component
586                // of the path prefix of pathname or the name of a script
587                // interpreter.
588                VerifyPluginPathError::PathPermissionDenied => Errno::EACCES,
589                VerifyPluginPathError::UnhandledIoError(_) => {
590                    // Arbitrary error that should be handled by callers.
591                    Errno::ENOEXEC
592                }
593            }
594        }
595        verify_plugin_path(std::ffi::OsStr::from_bytes(plugin_path.to_bytes()))
596            .map_err(map_verify_err)?;
597
598        // posix_spawn is documented as taking pointers to *mutable* char for argv and
599        // envv. It *probably* doesn't actually mutate them, but we
600        // conservatively give it what it asks for. We have to "reconstitute"
601        // the CString's after the fork + exec to deallocate them.
602        let argv_ptrs: Vec<*mut i8> = argv
603            .into_iter()
604            .map(CString::into_raw)
605            // the last element of argv must be NULL
606            .chain(std::iter::once(std::ptr::null_mut()))
607            .collect();
608        let envv_ptrs: Vec<*mut i8> = envv
609            .into_iter()
610            .map(CString::into_raw)
611            // the last element of argv must be NULL
612            .chain(std::iter::once(std::ptr::null_mut()))
613            .collect();
614
615        let mut file_actions: libc::posix_spawn_file_actions_t = shadow_pod::zeroed();
616        Errno::result_from_libc_errnum(unsafe {
617            libc::posix_spawn_file_actions_init(&mut file_actions)
618        })
619        .unwrap();
620
621        // Set up stdin
622        let (stdin_reader, stdin_writer) = rustix::pipe::pipe_with(PipeFlags::CLOEXEC).unwrap();
623        Errno::result_from_libc_errnum(unsafe {
624            libc::posix_spawn_file_actions_adddup2(
625                &mut file_actions,
626                stdin_reader.as_raw_fd(),
627                libc::STDIN_FILENO,
628            )
629        })
630        .unwrap();
631
632        // Dup straceFd; the dup'd descriptor won't have O_CLOEXEC set.
633        //
634        // Since dup2 is a no-op when the new and old file descriptors are equal, we have
635        // to arrange to call dup2 twice - first to a temporary descriptor, and then back
636        // to the original descriptor number.
637        //
638        // Here we use STDOUT_FILENO as the temporary descriptor, since we later
639        // replace that below.
640        //
641        // Once we drop support for platforms with glibc older than 2.29, we *could*
642        // consider taking advantage of a new feature that would let us just use a
643        // single `posix_spawn_file_actions_adddup2` call with equal descriptors.
644        // OTOH it's a non-standard extension, and I think ultimately uses the same
645        // number of syscalls, so it might be better to continue using this slightly
646        // more awkward method anyway.
647        // https://github.com/bminor/glibc/commit/805334b26c7e6e83557234f2008497c72176a6cd
648        // https://austingroupbugs.net/view.php?id=411
649        if let Some(strace_file) = strace_file {
650            Errno::result_from_libc_errnum(unsafe {
651                libc::posix_spawn_file_actions_adddup2(
652                    &mut file_actions,
653                    strace_file.as_raw_fd(),
654                    libc::STDOUT_FILENO,
655                )
656            })
657            .unwrap();
658            Errno::result_from_libc_errnum(unsafe {
659                libc::posix_spawn_file_actions_adddup2(
660                    &mut file_actions,
661                    libc::STDOUT_FILENO,
662                    strace_file.as_raw_fd(),
663                )
664            })
665            .unwrap();
666        }
667
668        // set stdout/stderr as the shim log. This also clears the FD_CLOEXEC flag.
669        Errno::result_from_libc_errnum(unsafe {
670            libc::posix_spawn_file_actions_adddup2(
671                &mut file_actions,
672                shimlog_file.as_raw_fd(),
673                libc::STDOUT_FILENO,
674            )
675        })
676        .unwrap();
677        Errno::result_from_libc_errnum(unsafe {
678            libc::posix_spawn_file_actions_adddup2(
679                &mut file_actions,
680                shimlog_file.as_raw_fd(),
681                libc::STDERR_FILENO,
682            )
683        })
684        .unwrap();
685
686        let mut spawn_attr: libc::posix_spawnattr_t = shadow_pod::zeroed();
687        Errno::result_from_libc_errnum(unsafe { libc::posix_spawnattr_init(&mut spawn_attr) })
688            .unwrap();
689
690        // In versions of glibc before 2.24, we need this to tell posix_spawn
691        // to use vfork instead of fork. In later versions it's a no-op.
692        Errno::result_from_libc_errnum(unsafe {
693            libc::posix_spawnattr_setflags(
694                &mut spawn_attr,
695                libc::POSIX_SPAWN_USEVFORK.try_into().unwrap(),
696            )
697        })
698        .unwrap();
699
700        let child_pid_res = {
701            let mut child_pid = -1;
702            Errno::result_from_libc_errnum(unsafe {
703                libc::posix_spawn(
704                    &mut child_pid,
705                    plugin_path.as_ptr(),
706                    &file_actions,
707                    &spawn_attr,
708                    argv_ptrs.as_ptr(),
709                    envv_ptrs.as_ptr(),
710                )
711            })
712            .map(|_| Pid::from_raw(child_pid).unwrap_or_else(|| panic!("Invalid pid: {child_pid}")))
713        };
714
715        // Write the serialized shmem descriptor to the stdin pipe. The pipe
716        // buffer should be large enough that we can write it all without having
717        // to wait for data to be read.
718        if child_pid_res.is_ok() {
719            // we avoid using the rustix write wrapper here, since we can't guarantee
720            // that all bytes of the serialized shmem block are initd, and hence
721            // can't safely construct the &[u8] that it wants.
722            let serialized = shmem_block.serialize();
723            let serialized_bytes = shadow_pod::as_u8_slice(&serialized);
724            let written = Errno::result_from_libc_errno(-1, unsafe {
725                libc::write(
726                    stdin_writer.as_raw_fd(),
727                    serialized_bytes.as_ptr().cast(),
728                    serialized_bytes.len(),
729                )
730            })
731            .unwrap();
732            // TODO: loop if needed. Shouldn't be in practice, though.
733            assert_eq!(written, isize::try_from(serialized_bytes.len()).unwrap());
734        }
735
736        Errno::result_from_libc_errnum(unsafe {
737            libc::posix_spawn_file_actions_destroy(&mut file_actions)
738        })
739        .unwrap();
740        Errno::result_from_libc_errnum(unsafe { libc::posix_spawnattr_destroy(&mut spawn_attr) })
741            .unwrap();
742
743        // Drop the cloned argv and env.
744        drop(
745            argv_ptrs
746                .into_iter()
747                .filter(|p| !p.is_null())
748                .map(|p| unsafe { CString::from_raw(p) }),
749        );
750        drop(
751            envv_ptrs
752                .into_iter()
753                .filter(|p| !p.is_null())
754                .map(|p| unsafe { CString::from_raw(p) }),
755        );
756
757        debug!(
758            "starting process {}, result: {child_pid_res:?}",
759            plugin_path.to_str().unwrap()
760        );
761
762        child_pid_res
763    }
764
765    /// `ManagedThread` panics if dropped while the underlying process is still running,
766    /// since otherwise that process could continue writing to shared memory regions
767    /// that shadow reallocates.
768    ///
769    /// This method kills the process that `self` belongs to (not just the
770    /// thread!) and then drops `self`.
771    pub fn kill_and_drop(self) {
772        if let Err(err) =
773            rustix::process::kill_process(self.native_pid().into(), rustix::process::Signal::Kill)
774        {
775            log::warn!(
776                "Couldn't kill managed process {:?}. kill: {:?}",
777                self.native_pid(),
778                err
779            );
780        }
781        self.handle_process_exit();
782    }
783}
784
785impl Drop for ManagedThread {
786    fn drop(&mut self) {
787        // Dropping while the thread is running is unsound because the running
788        // thread still has access to shared memory regions that will be
789        // deallocated, and potentially reallocated for another purpose. The
790        // running thread accessing a deallocated or repurposed memory region
791        // can cause numerous problems.
792        assert!(!self.is_running());
793    }
794}