shadow_rs/host/syscall/handler/
unistd.rs

1use std::ffi::{CStr, CString};
2use std::os::unix::ffi::OsStringExt;
3use std::sync::Arc;
4
5use atomic_refcell::AtomicRefCell;
6use linux_api::errno::Errno;
7use linux_api::fcntl::{DescriptorFlags, OFlag};
8use linux_api::posix_types::{kernel_off_t, kernel_pid_t};
9use log::*;
10use shadow_shim_helper_rs::emulated_time::EmulatedTime;
11use shadow_shim_helper_rs::rootedcell::refcell::RootedRefCell;
12use shadow_shim_helper_rs::simulation_time::SimulationTime;
13use shadow_shim_helper_rs::syscall_types::ForeignPtr;
14
15use crate::core::work::task::TaskRef;
16use crate::core::worker::Worker;
17use crate::cshadow as c;
18use crate::host::descriptor::descriptor_table::DescriptorHandle;
19use crate::host::descriptor::pipe;
20use crate::host::descriptor::shared_buf::SharedBuf;
21use crate::host::descriptor::{CompatFile, Descriptor, File, FileMode, FileStatus, OpenFile};
22use crate::host::process::{Process, ProcessId};
23use crate::host::syscall::handler::{SyscallContext, SyscallHandler};
24use crate::host::syscall::io::{IoVec, read_cstring_vec};
25use crate::host::syscall::type_formatting::{SyscallBufferArg, SyscallStringArg};
26use crate::host::syscall::types::{ForeignArrayPtr, SyscallError};
27use crate::utility::callback_queue::CallbackQueue;
28use crate::utility::u8_to_i8_slice;
29
30impl SyscallHandler {
31    log_syscall!(
32        close,
33        /* rv */ std::ffi::c_int,
34        /* fd */ std::ffi::c_int,
35    );
36    pub fn close(ctx: &mut SyscallContext, fd: std::ffi::c_int) -> Result<(), SyscallError> {
37        trace!("Trying to close fd {}", fd);
38
39        let fd = fd.try_into().or(Err(linux_api::errno::Errno::EBADF))?;
40
41        // according to "man 2 close", in Linux any errors that may occur will happen after the fd is
42        // released, so we should always deregister the descriptor even if there's an error while
43        // closing
44        let desc = ctx
45            .objs
46            .thread
47            .descriptor_table_borrow_mut(ctx.objs.host)
48            .deregister_descriptor(fd)
49            .ok_or(linux_api::errno::Errno::EBADF)?;
50
51        // if there are still valid descriptors to the open file, close() will do nothing
52        // and return None
53        CallbackQueue::queue_and_run_with_legacy(|cb_queue| desc.close(ctx.objs.host, cb_queue))
54            .unwrap_or(Ok(()))
55    }
56
57    log_syscall!(
58        dup,
59        /* rv */ std::ffi::c_int,
60        /* oldfd */ std::ffi::c_int,
61    );
62    pub fn dup(
63        ctx: &mut SyscallContext,
64        fd: std::ffi::c_int,
65    ) -> Result<DescriptorHandle, SyscallError> {
66        // get the descriptor, or return early if it doesn't exist
67        let mut desc_table = ctx.objs.thread.descriptor_table_borrow_mut(ctx.objs.host);
68        let desc = Self::get_descriptor(&desc_table, fd)?;
69
70        // duplicate the descriptor
71        let new_desc = desc.dup(DescriptorFlags::empty());
72
73        Ok(desc_table
74            .register_descriptor(new_desc)
75            .or(Err(Errno::ENFILE))?)
76    }
77
78    log_syscall!(
79        dup2,
80        /* rv */ std::ffi::c_int,
81        /* oldfd */ std::ffi::c_int,
82        /* newfd */ std::ffi::c_int,
83    );
84    pub fn dup2(
85        ctx: &mut SyscallContext,
86        old_fd: std::ffi::c_int,
87        new_fd: std::ffi::c_int,
88    ) -> Result<DescriptorHandle, SyscallError> {
89        let old_fd = DescriptorHandle::try_from(old_fd).or(Err(Errno::EBADF))?;
90        let new_fd = DescriptorHandle::try_from(new_fd).or(Err(Errno::EBADF))?;
91
92        // get the descriptor, or return early if it doesn't exist
93        let mut desc_table = ctx.objs.thread.descriptor_table_borrow_mut(ctx.objs.host);
94        let desc = Self::get_descriptor(&desc_table, old_fd)?;
95
96        // from 'man 2 dup2': "If oldfd is a valid file descriptor, and newfd has the same
97        // value as oldfd, then dup2() does nothing, and returns newfd"
98        if old_fd == new_fd {
99            return Ok(new_fd);
100        }
101
102        // duplicate the descriptor
103        let new_desc = desc.dup(DescriptorFlags::empty());
104        let replaced_desc = desc_table.register_descriptor_with_fd(new_desc, new_fd);
105
106        // close the replaced descriptor
107        if let Some(replaced_desc) = replaced_desc {
108            // from 'man 2 dup2': "If newfd was open, any errors that would have been reported at
109            // close(2) time are lost"
110            CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
111                replaced_desc.close(ctx.objs.host, cb_queue)
112            });
113        }
114
115        // return the new fd
116        Ok(new_fd)
117    }
118
119    log_syscall!(
120        dup3,
121        /* rv */ std::ffi::c_int,
122        /* oldfd */ std::ffi::c_int,
123        /* newfd */ std::ffi::c_int,
124        /* flags */ linux_api::fcntl::OFlag,
125    );
126    pub fn dup3(
127        ctx: &mut SyscallContext,
128        old_fd: std::ffi::c_int,
129        new_fd: std::ffi::c_int,
130        flags: std::ffi::c_int,
131    ) -> Result<DescriptorHandle, SyscallError> {
132        // get the descriptor, or return early if it doesn't exist
133        let mut desc_table = ctx.objs.thread.descriptor_table_borrow_mut(ctx.objs.host);
134        let desc = Self::get_descriptor(&desc_table, old_fd)?;
135
136        // from 'man 2 dup3': "If oldfd equals newfd, then dup3() fails with the error EINVAL"
137        if old_fd == new_fd {
138            return Err(linux_api::errno::Errno::EINVAL.into());
139        }
140
141        let new_fd = new_fd.try_into().or(Err(linux_api::errno::Errno::EBADF))?;
142
143        let Some(flags) = OFlag::from_bits(flags) else {
144            debug!("Invalid flags: {flags}");
145            return Err(linux_api::errno::Errno::EINVAL.into());
146        };
147
148        let mut descriptor_flags = DescriptorFlags::empty();
149
150        // dup3 only supports the O_CLOEXEC flag
151        for flag in flags {
152            match flag {
153                OFlag::O_CLOEXEC => descriptor_flags.insert(DescriptorFlags::FD_CLOEXEC),
154                x if x == OFlag::empty() => {
155                    // The "empty" flag is always present. Ignore.
156                }
157                _ => {
158                    debug!("Invalid flags for dup3: {flags:?}");
159                    return Err(linux_api::errno::Errno::EINVAL.into());
160                }
161            }
162        }
163
164        // duplicate the descriptor
165        let new_desc = desc.dup(descriptor_flags);
166        let replaced_desc = desc_table.register_descriptor_with_fd(new_desc, new_fd);
167
168        // close the replaced descriptor
169        if let Some(replaced_desc) = replaced_desc {
170            // from 'man 2 dup3': "If newfd was open, any errors that would have been reported at
171            // close(2) time are lost"
172            CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
173                replaced_desc.close(ctx.objs.host, cb_queue)
174            });
175        }
176
177        // return the new fd
178        Ok(new_fd)
179    }
180
181    log_syscall!(
182        read,
183        /* rv */ isize,
184        /* fd */ std::ffi::c_int,
185        /* buf */ *const std::ffi::c_void,
186        /* count */ usize,
187    );
188    pub fn read(
189        ctx: &mut SyscallContext,
190        fd: std::ffi::c_int,
191        buf_ptr: ForeignPtr<u8>,
192        buf_size: usize,
193    ) -> Result<isize, SyscallError> {
194        // if we were previously blocked, get the active file from the last syscall handler
195        // invocation since it may no longer exist in the descriptor table
196        let file = ctx
197            .objs
198            .thread
199            .syscall_condition()
200            // if this was for a C descriptor, then there won't be an active file object
201            .and_then(|x| x.active_file().cloned());
202
203        let file = match file {
204            // we were previously blocked, so re-use the file from the previous syscall invocation
205            Some(x) => x,
206            // get the file from the descriptor table, or return early if it doesn't exist
207            None => {
208                let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
209                match Self::get_descriptor(&desc_table, fd)?.file() {
210                    CompatFile::New(file) => file.clone(),
211                    // if it's a legacy file, use the C syscall handler instead
212                    CompatFile::Legacy(_) => {
213                        drop(desc_table);
214                        return Self::legacy_syscall(c::syscallhandler_read, ctx);
215                    }
216                }
217            }
218        };
219
220        let mut result = Self::read_helper(ctx, file.inner_file(), buf_ptr, buf_size, None);
221
222        // if the syscall will block, keep the file open until the syscall restarts
223        if let Some(err) = result.as_mut().err() {
224            if let Some(cond) = err.blocked_condition() {
225                cond.set_active_file(file);
226            }
227        }
228
229        let bytes_read = result?;
230        Ok(bytes_read)
231    }
232
233    log_syscall!(
234        pread64,
235        /* rv */ isize,
236        /* fd */ std::ffi::c_int,
237        /* buf */ *const std::ffi::c_void,
238        /* count */ usize,
239        /* offset */ kernel_off_t,
240    );
241    pub fn pread64(
242        ctx: &mut SyscallContext,
243        fd: std::ffi::c_int,
244        buf_ptr: ForeignPtr<u8>,
245        buf_size: usize,
246        offset: kernel_off_t,
247    ) -> Result<isize, SyscallError> {
248        // if we were previously blocked, get the active file from the last syscall handler
249        // invocation since it may no longer exist in the descriptor table
250        let file = ctx
251            .objs
252            .thread
253            .syscall_condition()
254            // if this was for a C descriptor, then there won't be an active file object
255            .and_then(|x| x.active_file().cloned());
256
257        let file = match file {
258            // we were previously blocked, so re-use the file from the previous syscall invocation
259            Some(x) => x,
260            // get the file from the descriptor table, or return early if it doesn't exist
261            None => {
262                let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
263                match Self::get_descriptor(&desc_table, fd)?.file() {
264                    CompatFile::New(file) => file.clone(),
265                    // if it's a legacy file, use the C syscall handler instead
266                    CompatFile::Legacy(_) => {
267                        drop(desc_table);
268                        return Self::legacy_syscall(c::syscallhandler_pread64, ctx);
269                    }
270                }
271            }
272        };
273
274        let mut result = Self::read_helper(ctx, file.inner_file(), buf_ptr, buf_size, Some(offset));
275
276        // if the syscall will block, keep the file open until the syscall restarts
277        if let Some(err) = result.as_mut().err() {
278            if let Some(cond) = err.blocked_condition() {
279                cond.set_active_file(file);
280            }
281        }
282
283        let bytes_read = result?;
284        Ok(bytes_read)
285    }
286
287    fn read_helper(
288        ctx: &mut SyscallContext,
289        file: &File,
290        buf_ptr: ForeignPtr<u8>,
291        buf_size: usize,
292        offset: Option<kernel_off_t>,
293    ) -> Result<isize, SyscallError> {
294        let iov = IoVec {
295            base: buf_ptr,
296            len: buf_size,
297        };
298        Self::readv_helper(ctx, file, &[iov], offset, 0)
299    }
300
301    log_syscall!(
302        write,
303        /* rv */ isize,
304        /* fd */ std::ffi::c_int,
305        /* buf */ SyscallBufferArg</* count */ 2>,
306        /* count */ usize,
307    );
308    pub fn write(
309        ctx: &mut SyscallContext,
310        fd: std::ffi::c_int,
311        buf_ptr: ForeignPtr<u8>,
312        buf_size: usize,
313    ) -> Result<isize, SyscallError> {
314        // if we were previously blocked, get the active file from the last syscall handler
315        // invocation since it may no longer exist in the descriptor table
316        let file = ctx
317            .objs
318            .thread
319            .syscall_condition()
320            // if this was for a C descriptor, then there won't be an active file object
321            .and_then(|x| x.active_file().cloned());
322
323        let file = match file {
324            // we were previously blocked, so re-use the file from the previous syscall invocation
325            Some(x) => x,
326            // get the file from the descriptor table, or return early if it doesn't exist
327            None => {
328                let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
329                match Self::get_descriptor(&desc_table, fd)?.file() {
330                    CompatFile::New(file) => file.clone(),
331                    // if it's a legacy file, use the C syscall handler instead
332                    CompatFile::Legacy(_) => {
333                        drop(desc_table);
334                        return Self::legacy_syscall(c::syscallhandler_write, ctx);
335                    }
336                }
337            }
338        };
339
340        let mut result = Self::write_helper(ctx, file.inner_file(), buf_ptr, buf_size, None);
341
342        // if the syscall will block, keep the file open until the syscall restarts
343        if let Some(err) = result.as_mut().err() {
344            if let Some(cond) = err.blocked_condition() {
345                cond.set_active_file(file);
346            }
347        }
348
349        let bytes_written = result?;
350        Ok(bytes_written)
351    }
352
353    log_syscall!(
354        pwrite64,
355        /* rv */ isize,
356        /* fd */ std::ffi::c_int,
357        /* buf */ SyscallBufferArg</* count */ 2>,
358        /* count */ usize,
359        /* offset */ kernel_off_t,
360    );
361    pub fn pwrite64(
362        ctx: &mut SyscallContext,
363        fd: std::ffi::c_int,
364        buf_ptr: ForeignPtr<u8>,
365        buf_size: usize,
366        offset: kernel_off_t,
367    ) -> Result<isize, SyscallError> {
368        // if we were previously blocked, get the active file from the last syscall handler
369        // invocation since it may no longer exist in the descriptor table
370        let file = ctx
371            .objs
372            .thread
373            .syscall_condition()
374            // if this was for a C descriptor, then there won't be an active file object
375            .and_then(|x| x.active_file().cloned());
376
377        let file = match file {
378            // we were previously blocked, so re-use the file from the previous syscall invocation
379            Some(x) => x,
380            // get the file from the descriptor table, or return early if it doesn't exist
381            None => {
382                let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
383                match Self::get_descriptor(&desc_table, fd)?.file() {
384                    CompatFile::New(file) => file.clone(),
385                    // if it's a legacy file, use the C syscall handler instead
386                    CompatFile::Legacy(_) => {
387                        drop(desc_table);
388                        return Self::legacy_syscall(c::syscallhandler_pwrite64, ctx);
389                    }
390                }
391            }
392        };
393
394        let mut result =
395            Self::write_helper(ctx, file.inner_file(), buf_ptr, buf_size, Some(offset));
396
397        // if the syscall will block, keep the file open until the syscall restarts
398        if let Some(err) = result.as_mut().err() {
399            if let Some(cond) = err.blocked_condition() {
400                cond.set_active_file(file);
401            }
402        }
403
404        let bytes_written = result?;
405        Ok(bytes_written)
406    }
407
408    fn write_helper(
409        ctx: &mut SyscallContext,
410        file: &File,
411        buf_ptr: ForeignPtr<u8>,
412        buf_size: usize,
413        offset: Option<kernel_off_t>,
414    ) -> Result<isize, SyscallError> {
415        let iov = IoVec {
416            base: buf_ptr,
417            len: buf_size,
418        };
419        Self::writev_helper(ctx, file, &[iov], offset, 0)
420    }
421
422    log_syscall!(
423        pipe,
424        /* rv */ std::ffi::c_int,
425        /* pipefd */ [std::ffi::c_int; 2],
426    );
427    pub fn pipe(
428        ctx: &mut SyscallContext,
429        fd_ptr: ForeignPtr<[std::ffi::c_int; 2]>,
430    ) -> Result<(), SyscallError> {
431        Self::pipe_helper(ctx, fd_ptr, 0)
432    }
433
434    log_syscall!(
435        pipe2,
436        /* rv */ std::ffi::c_int,
437        /* pipefd */ [std::ffi::c_int; 2],
438        /* flags */ linux_api::fcntl::OFlag,
439    );
440    pub fn pipe2(
441        ctx: &mut SyscallContext,
442        fd_ptr: ForeignPtr<[std::ffi::c_int; 2]>,
443        flags: std::ffi::c_int,
444    ) -> Result<(), SyscallError> {
445        Self::pipe_helper(ctx, fd_ptr, flags)
446    }
447
448    fn pipe_helper(
449        ctx: &mut SyscallContext,
450        fd_ptr: ForeignPtr<[std::ffi::c_int; 2]>,
451        flags: i32,
452    ) -> Result<(), SyscallError> {
453        // make sure they didn't pass a NULL pointer
454        if fd_ptr.is_null() {
455            return Err(linux_api::errno::Errno::EFAULT.into());
456        }
457
458        let Some(flags) = OFlag::from_bits(flags) else {
459            debug!("Invalid flags: {flags}");
460            return Err(Errno::EINVAL.into());
461        };
462
463        let mut file_flags = FileStatus::empty();
464        let mut descriptor_flags = DescriptorFlags::empty();
465
466        for flag in flags.iter() {
467            match flag {
468                OFlag::O_NONBLOCK => file_flags.insert(FileStatus::NONBLOCK),
469                OFlag::O_DIRECT => file_flags.insert(FileStatus::DIRECT),
470                OFlag::O_CLOEXEC => descriptor_flags.insert(DescriptorFlags::FD_CLOEXEC),
471                x if x == OFlag::empty() => {
472                    // The "empty" flag is always present. Ignore.
473                }
474                unhandled => {
475                    // TODO: return an error and change this to `warn_once_then_debug`?
476                    warn!("Ignoring pipe flag {unhandled:?}");
477                }
478            }
479        }
480
481        // reference-counted buffer for the pipe
482        let buffer = SharedBuf::new(c::CONFIG_PIPE_BUFFER_SIZE.try_into().unwrap());
483        let buffer = Arc::new(AtomicRefCell::new(buffer));
484
485        // reference-counted file object for read end of the pipe
486        let reader = pipe::Pipe::new(FileMode::READ, file_flags);
487        let reader = Arc::new(AtomicRefCell::new(reader));
488
489        // reference-counted file object for write end of the pipe
490        let writer = pipe::Pipe::new(FileMode::WRITE, file_flags);
491        let writer = Arc::new(AtomicRefCell::new(writer));
492
493        // set the file objects to listen for events on the buffer
494        CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
495            pipe::Pipe::connect_to_buffer(&reader, Arc::clone(&buffer), cb_queue);
496            pipe::Pipe::connect_to_buffer(&writer, Arc::clone(&buffer), cb_queue);
497        });
498
499        // file descriptors for the read and write file objects
500        let mut reader_desc = Descriptor::new(CompatFile::New(OpenFile::new(File::Pipe(reader))));
501        let mut writer_desc = Descriptor::new(CompatFile::New(OpenFile::new(File::Pipe(writer))));
502
503        // set the file descriptor flags
504        reader_desc.set_flags(descriptor_flags);
505        writer_desc.set_flags(descriptor_flags);
506
507        // register the file descriptors
508        let mut dt = ctx.objs.thread.descriptor_table_borrow_mut(ctx.objs.host);
509        // unwrap here since the error handling would be messy (need to deregister) and we shouldn't
510        // ever need to worry about this in practice
511        let read_fd = dt.register_descriptor(reader_desc).unwrap();
512        let write_fd = dt.register_descriptor(writer_desc).unwrap();
513
514        // try to write them to the caller
515        let fds = [i32::from(read_fd), i32::from(write_fd)];
516        let write_res = ctx.objs.process.memory_borrow_mut().write(fd_ptr, &fds);
517
518        // clean up in case of error
519        match write_res {
520            Ok(_) => Ok(()),
521            Err(e) => {
522                CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
523                    // ignore any errors when closing
524                    dt.deregister_descriptor(read_fd)
525                        .unwrap()
526                        .close(ctx.objs.host, cb_queue);
527                    dt.deregister_descriptor(write_fd)
528                        .unwrap()
529                        .close(ctx.objs.host, cb_queue);
530                });
531                Err(e.into())
532            }
533        }
534    }
535
536    log_syscall!(getpid, /* rv */ linux_api::posix_types::kernel_pid_t);
537    pub fn getpid(ctx: &mut SyscallContext) -> Result<kernel_pid_t, SyscallError> {
538        Ok(ctx.objs.process.id().into())
539    }
540
541    log_syscall!(getppid, /* rv */ linux_api::posix_types::kernel_pid_t);
542    pub fn getppid(ctx: &mut SyscallContext) -> Result<kernel_pid_t, SyscallError> {
543        Ok(ctx.objs.process.parent_id().into())
544    }
545
546    log_syscall!(getpgrp, /* rv */ kernel_pid_t);
547    pub fn getpgrp(ctx: &mut SyscallContext) -> Result<kernel_pid_t, SyscallError> {
548        Ok(ctx.objs.process.group_id().into())
549    }
550
551    log_syscall!(
552        getpgid,
553        /* rv */ kernel_pid_t,
554        /* pid*/ kernel_pid_t,
555    );
556    pub fn getpgid(
557        ctx: &mut SyscallContext,
558        pid: kernel_pid_t,
559    ) -> Result<kernel_pid_t, SyscallError> {
560        if pid == 0 || pid == kernel_pid_t::from(ctx.objs.process.id()) {
561            return Ok(ctx.objs.process.group_id().into());
562        }
563        let pid = ProcessId::try_from(pid).map_err(|_| Errno::EINVAL)?;
564        let Some(process) = ctx.objs.host.process_borrow(pid) else {
565            return Err(Errno::ESRCH.into());
566        };
567        let process = process.borrow(ctx.objs.host.root());
568        Ok(process.group_id().into())
569    }
570
571    log_syscall!(
572        setpgid,
573        /* rv */ std::ffi::c_int,
574        /* pid */ kernel_pid_t,
575        /* pgid */ kernel_pid_t,
576    );
577    pub fn setpgid(
578        ctx: &mut SyscallContext,
579        pid: kernel_pid_t,
580        pgid: kernel_pid_t,
581    ) -> Result<(), SyscallError> {
582        let _processrc_borrow;
583        let _process_borrow;
584        let process: &Process;
585        if pid == 0 || pid == kernel_pid_t::from(ctx.objs.process.id()) {
586            _processrc_borrow = None;
587            _process_borrow = None;
588            process = ctx.objs.process;
589        } else {
590            let pid = ProcessId::try_from(pid).map_err(|_| Errno::EINVAL)?;
591            let Some(pbrc) = ctx.objs.host.process_borrow(pid) else {
592                return Err(Errno::ESRCH.into());
593            };
594            _processrc_borrow = Some(pbrc);
595            _process_borrow = Some(
596                _processrc_borrow
597                    .as_ref()
598                    .unwrap()
599                    .borrow(ctx.objs.host.root()),
600            );
601            process = _process_borrow.as_ref().unwrap();
602        }
603        let pgid = if pgid == 0 {
604            None
605        } else {
606            Some(ProcessId::try_from(pgid).map_err(|_| Errno::EINVAL)?)
607        };
608        if process.id() != ctx.objs.process.id() && process.parent_id() != ctx.objs.process.id() {
609            // `setpgid(2)`: pid is not the calling process and not a child  of
610            // the calling process.
611            return Err(Errno::ESRCH.into());
612        }
613        if let Some(pgid) = pgid {
614            if ctx.objs.host.process_session_id_of_group_id(pgid) != Some(process.session_id()) {
615                // An attempt was made to move a process into a process group in
616                // a different session
617                return Err(Errno::EPERM.into());
618            }
619        }
620        if process.session_id() != ctx.objs.process.session_id() {
621            // `setpgid(2)`: ... or to change the process  group  ID of one of
622            // the children of the calling process and the child was in a
623            // different session
624            return Err(Errno::EPERM.into());
625        }
626        if process.session_id() == process.id() {
627            // `setpgid(2)`: ... or to change the process group ID of a session leader
628            return Err(Errno::EPERM.into());
629        }
630        // TODO: Keep track of whether a process has performed an `execve`.
631        // `setpgid(2): EACCES: An attempt was made to change the process group
632        // ID of one of the children of the calling process and the child had
633        // already performed an execve(2).
634        if let Some(pgid) = pgid {
635            if ctx.objs.host.process_session_id_of_group_id(pgid) != Some(process.session_id()) {
636                // `setpgid(2)`: An attempt was made to move a process into a
637                // process group in a different session
638                return Err(Errno::EPERM.into());
639            }
640            process.set_group_id(pgid);
641        } else {
642            // `setpgid(2)`: If pgid is zero, then the PGID of the process
643            // specified by pid is made the same as its process ID.
644            process.set_group_id(process.id());
645        }
646        Ok(())
647    }
648
649    log_syscall!(
650        getsid,
651        /* rv */ kernel_pid_t,
652        /* pid */ kernel_pid_t,
653    );
654    pub fn getsid(
655        ctx: &mut SyscallContext,
656        pid: kernel_pid_t,
657    ) -> Result<kernel_pid_t, SyscallError> {
658        if pid == 0 {
659            return Ok(ctx.objs.process.session_id().into());
660        }
661        let Ok(pid) = ProcessId::try_from(pid) else {
662            return Err(Errno::EINVAL.into());
663        };
664        let Some(processrc) = ctx.objs.host.process_borrow(pid) else {
665            return Err(Errno::ESRCH.into());
666        };
667        let process = processrc.borrow(ctx.objs.host.root());
668        // No need to check that process is in the same session:
669        //
670        // `getsid(2)`: A process with process ID pid exists, but it is not in
671        // the same session as the calling process, and the implementation
672        // considers this an error... **Linux does not return EPERM**.
673
674        Ok(process.session_id().into())
675    }
676
677    log_syscall!(setsid, /* rv */ kernel_pid_t);
678    pub fn setsid(ctx: &mut SyscallContext) -> Result<kernel_pid_t, SyscallError> {
679        let pid = ctx.objs.process.id();
680        if ctx.objs.host.process_session_id_of_group_id(pid).is_some() {
681            // `setsid(2)`: The process group ID of any process equals the PID
682            // of the calling process.  Thus, in particular, setsid() fails if
683            // the calling process is already a process group leader.
684            return Err(Errno::EPERM.into());
685        }
686
687        // `setsid(2)`: The calling process is the leader of the new session
688        // (i.e., its session ID is made the same as its process ID).
689        ctx.objs.process.set_session_id(pid);
690
691        // `setsid(2)`: The calling  process  also  becomes  the  process group
692        // leader of a new process group in the session (i.e., its process group
693        // ID is made the same as its process ID).
694        ctx.objs.process.set_group_id(pid);
695
696        Ok(pid.into())
697    }
698
699    fn execve_common(
700        ctx: &mut SyscallContext,
701        base_dir: &CStr,
702        path: &CStr,
703        argv_ptr_ptr: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
704        envv_ptr_ptr: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
705        _flags: std::ffi::c_int,
706    ) -> Result<(), SyscallError> {
707        if path.is_empty() {
708            // execve(2): The file pathname or a script or ELF interpreter does not exist.
709            return Err(Errno::ENOENT.into());
710        }
711
712        let path_bytes_with_nul = path.to_bytes_with_nul();
713
714        let _abs_path_storage: Option<CString>;
715        let abs_path: &CStr;
716        if path_bytes_with_nul[0] != b'/' {
717            let base_dir_bytes = base_dir.to_bytes();
718
719            // Maybe TODO: this could be done in place without allocating
720            // and with less copying (but more fiddly and error-prone).
721            let mut tmp = Vec::with_capacity(
722                base_dir_bytes.len() + path_bytes_with_nul.len() + /*separator*/1,
723            );
724            tmp.extend(base_dir_bytes);
725            tmp.push(b'/');
726            tmp.extend(path_bytes_with_nul);
727
728            _abs_path_storage = Some(CString::from_vec_with_nul(tmp).unwrap());
729            abs_path = _abs_path_storage.as_ref().unwrap();
730        } else {
731            _abs_path_storage = None;
732            abs_path = path;
733        }
734
735        // TODO: canonicalize? On one hand that would improve caching behavior
736        // in `verify_plugin_path`; OTOH it does some redundant work with
737        // `verify_plugin_path`. Ideal solution is probably to split up
738        // `verify_plugin_path` a bit.
739
740        // `execve(2)`: Most UNIX implementations impose some limit on the
741        // total size of the command-line  argument  (argv)  and
742        // environment  (envp) strings that may be passed to a new program.
743        // POSIX.1 allows an implementation to advertise this limit using
744        // the ARG_MAX constant
745
746        let argv;
747        let envv;
748        {
749            let mem = ctx.objs.process.memory_borrow();
750            argv = read_cstring_vec(&mem, argv_ptr_ptr)?;
751            envv = read_cstring_vec(&mem, envv_ptr_ptr)?;
752        }
753
754        let mthread = ctx
755            .objs
756            .process
757            .borrow_as_runnable()
758            .unwrap()
759            .spawn_mthread_for_exec(ctx.objs.host, abs_path, argv, envv)?;
760
761        // If we get this far, then we should be able to ultimately succeed.
762        // We need a mutable reference to the Process to update it, though, which we can't
763        // get from here since it's already borrowed immutably.
764        //
765        // So, we return a "blocking" result from this syscall handler, and
766        // schedule an event to update the `Process` and resume execution.
767        //
768        // It's possible that other events may affect the `Process` before this one runs.
769        // We try to handle this gracefully; e.g. if the `Process` has exited before this
770        // event runs, we kill and drop the exec'd `ManagedThread` and carry on.
771        //
772        // TODO: There may be other interactions that aren't handled correctly.
773        // e.g. if the exec'ing thread ends up handling a signal in the meantime.
774        // * We could add a new state "`Execing`" to `Process`, and force any
775        // such events to decide how to deal with it. e.g. signal delivery
776        // events could reschedule themselves to run after the exec has
777        // completed. This seems a bit heavy-weight, though.
778        // * We could add more interior mutability s.t. we don't need mutable
779        // references to the Thread and Process in order to do the necessary
780        // updates. This is a fair bit of extra interior mutability to add
781        // though, and has a side-effect of further complicating read-accesses
782        // to items that are read-mostly.
783        // * We could arrange for syscall handlers to get or be able to get
784        // mutable references to the Thread and Process, so that we can complete
785        // the updates synchronously here. This is currently blocked by the
786        // usage of `worker_getCurrentProcess` and `worker_getCurrentThread`,
787        // which will panic with incompatible borrow errors if those are
788        // borrowed mutably.  There aren't many references left to those though,
789        // maybe we can eliminate them.
790        {
791            let pid = ctx.objs.process.id();
792            let tid = ctx.objs.thread.id();
793
794            // Tasks are currently required to be `Sync` and to implement `Fn`, not just `FnOnce`.
795            // Since `mthread` isn't `Sync`, we need to wrap it in a `RootedRefCell`.
796            // Since we need to consume it, we need to also wrap it in an
797            // `Option` and fail at runtime if this actually gets executed
798            // multiple times.
799            // TODO: Split TaskRef into another type that only requires `FnOnce` and `Send`.
800            let mthread = RootedRefCell::new(ctx.objs.host.root(), Some(mthread));
801            ctx.objs.host.schedule_task_with_delay(
802                TaskRef::new(move |host| {
803                    // Take the `mthread` out of the captured wrapper.
804                    // This task shouldn't run multiple times, so this should be
805                    // infallible.
806                    let mthread = mthread.borrow_mut(host.root()).take().unwrap();
807                    // The exec'ing thread's ID is changed to match the pid, since it's
808                    // the new thread-group-leader.
809                    let new_tglid = {
810                        let Some(processrc) = host.process_borrow(pid) else {
811                            // Can happen if another event runs before this one
812                            // and causes the Process to exit (e.g. exit_group
813                            // called from another Thread).
814                            log::debug!("Process {pid:?} disappeared before exec could complete");
815                            mthread.kill_and_drop();
816                            return;
817                        };
818                        Worker::set_active_process(&processrc);
819                        let mut process = processrc.borrow_mut(host.root());
820                        process.update_for_exec(host, tid, mthread);
821                        Worker::clear_active_process();
822                        process.thread_group_leader_id()
823                    };
824                    host.resume(pid, new_tglid);
825                }),
826                SimulationTime::ZERO,
827            );
828        }
829
830        Err(SyscallError::new_blocked_until(EmulatedTime::MAX, false))
831    }
832
833    log_syscall!(
834        execve,
835        /* rv */ i32,
836        /* pathname */ SyscallStringArg,
837        /* argv */ *const std::ffi::c_void,
838        /* envp */ *const std::ffi::c_void,
839    );
840    pub fn execve(
841        ctx: &mut SyscallContext,
842        pathname: ForeignPtr<std::ffi::c_char>,
843        argv: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
844        envp: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
845    ) -> Result<i64, SyscallError> {
846        let mut path_buf = [0u8; linux_api::limits::PATH_MAX];
847        let path_buf_capacity = path_buf.len();
848        let path = ctx.objs.process.memory_borrow().copy_str_from_ptr(
849            &mut path_buf,
850            ForeignArrayPtr::new(pathname.cast::<u8>(), path_buf_capacity),
851        )?;
852
853        Self::execve_common(
854            ctx,
855            &ctx.objs.process.current_working_dir(),
856            path,
857            argv,
858            envp,
859            0,
860        )
861        .map(|_| 0)
862    }
863
864    log_syscall!(
865        execveat,
866        /* rv */ i32,
867        /* dirfd */ std::ffi::c_int,
868        /* pathname */ SyscallStringArg,
869        /* argv */ *const std::ffi::c_void,
870        /* envp */ *const std::ffi::c_void,
871        /* flags */ std::ffi::c_int,
872    );
873    pub fn execveat(
874        _ctx: &mut SyscallContext,
875        _dirfd: std::ffi::c_int,
876        _pathname: ForeignPtr<std::ffi::c_char>,
877        _argv: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
878        _envp: ForeignPtr<ForeignPtr<std::ffi::c_char>>,
879        _flags: std::ffi::c_int,
880    ) -> Result<i64, SyscallError> {
881        // TODO: Implement resolution of the path to the executable,
882        // and then call `execve_common` with that.
883        Err(Errno::ENOSYS.into())
884    }
885
886    log_syscall!(
887        exit_group,
888        /* rv */ std::ffi::c_int,
889        /* error_code */ std::ffi::c_int,
890    );
891    pub fn exit_group(
892        _ctx: &mut SyscallContext,
893        error_code: std::ffi::c_int,
894    ) -> Result<(), SyscallError> {
895        log::trace!("Exit group with exit code {error_code}");
896        Err(SyscallError::Native)
897    }
898
899    log_syscall!(
900        set_tid_address,
901        /* rv */ linux_api::posix_types::kernel_pid_t,
902        /* tidptr */ *const std::ffi::c_int,
903    );
904    pub fn set_tid_address(
905        ctx: &mut SyscallContext,
906        tid_ptr: ForeignPtr<std::ffi::c_int>,
907    ) -> Result<kernel_pid_t, SyscallError> {
908        ctx.objs
909            .thread
910            .set_tid_address(tid_ptr.cast::<libc::pid_t>());
911        Ok(ctx.objs.thread.id().into())
912    }
913
914    log_syscall!(
915        uname,
916        /* rv */ std::ffi::c_int,
917        /* name */ *const std::ffi::c_void,
918    );
919    pub fn uname(
920        ctx: &mut SyscallContext,
921        name_ptr: ForeignPtr<linux_api::utsname::new_utsname>,
922    ) -> Result<(), SyscallError> {
923        // NOTE: On linux x86-64, `SYS_uname` corresponds with `__NR_uname` which calls
924        // `sys_newuname` and not `sys_uname`. The correct mapping is:
925        //
926        // - __NR_oldolduname -> sys_olduname
927        // - __NR_olduname -> sys_uname
928        // - __NR_uname -> sys_newuname
929        //
930        // Some online resources such as the chromium syscall table are incorrect.
931
932        let mut name: linux_api::utsname::new_utsname = shadow_pod::zeroed();
933
934        let nodename = u8_to_i8_slice(ctx.objs.host.info().name.as_bytes());
935
936        // Currently hardcoded with values reported in Debian 12
937        let sysname = u8_to_i8_slice(&b"Linux"[..]);
938        let release = u8_to_i8_slice(&b"6.1.0-25-amd64"[..]);
939        let version = u8_to_i8_slice(&b"#1 SMP PREEMPT_DYNAMIC Debian 6.1.106-3 (2024-08-26)"[..]);
940        let machine = u8_to_i8_slice(&b"x86_64"[..]);
941
942        name.sysname[..sysname.len()].copy_from_slice(sysname);
943        name.nodename[..nodename.len()].copy_from_slice(nodename);
944        name.release[..release.len()].copy_from_slice(release);
945        name.version[..version.len()].copy_from_slice(version);
946        name.machine[..machine.len()].copy_from_slice(machine);
947
948        ctx.objs
949            .process
950            .memory_borrow_mut()
951            .write(name_ptr, &name)?;
952
953        Ok(())
954    }
955
956    log_syscall!(
957        chdir,
958        /* rv */ std::ffi::c_int,
959        /* path */ SyscallStringArg,
960    );
961    pub fn chdir(
962        ctx: &mut SyscallContext,
963        path: ForeignPtr<std::ffi::c_char>,
964    ) -> Result<(), SyscallError> {
965        // The native working directory must match the emulated one
966        // <https://github.com/shadow/shadow/issues/2960>. First execute the
967        // native chdir, propagating any failures.
968        let (process, thread) = ctx.objs.split_thread();
969        thread.native_chdir(&process, path)?;
970
971        // Update our internal copy of the cwd.
972        //
973        // We could try to work it out ourselves based on the previous cwd and
974        // the path we were passed, but this seems a bit tricky and error-prone.
975        //
976        // We could have the managed thread execute a native `getcwd`, but we'd
977        // also need to have it allocate and free memory to use with it, making
978        // this a bit complex and high overhead.
979        //
980        // Instead we use the proc file system. `/proc/<pid>/cwd` should be a
981        // symbolic link to the actual working dir we just set.
982        let procpath = format!("/proc/{}/cwd", thread.native_tid().as_raw_nonzero().get());
983        let newcwd = std::fs::read_link(&procpath)
984            .unwrap_or_else(|e| panic!("Couldn't find new cwd {procpath}: {e:?}"));
985        let mut newcwd = newcwd.into_os_string().into_vec();
986        newcwd.push(0);
987        let newcwd = CString::from_vec_with_nul(newcwd).unwrap();
988        process.process.set_current_working_dir(newcwd);
989        Ok(())
990    }
991}