
shadow_rs/host/syscall/handler/sched.rs

use std::mem::MaybeUninit;

use bitflags::Flags;
use linux_api::errno::Errno;
use linux_api::posix_types::kernel_pid_t;
use linux_api::rseq::{rseq, rseq_flags};
use linux_api::sched::{SCHED_RESET_ON_FORK, Sched, sched_attr};
use log::warn;
use shadow_shim_helper_rs::explicit_drop::{ExplicitDrop, ExplicitDropper};
use shadow_shim_helper_rs::syscall_types::ForeignPtr;

use crate::host::syscall::handler::{SyscallContext, SyscallHandler};
use crate::host::syscall::type_formatting::SyscallNonDeterministicArg;
use crate::host::syscall::types::ForeignArrayPtr;
use crate::host::thread::{Thread, ThreadId};

// We always report that the thread is running on CPU 0, Node 0
const CURRENT_CPU: u32 = 0;

/// Run `f` on the thread targeted by the scheduler syscalls.
///
/// Linux treats a tid of 0 as the current thread for these calls. For any other tid, borrow the
/// emulated thread through a cloned `RootedRc` and explicitly drop that clone before returning.
fn with_sched_target_thread<T>(
    ctx: &SyscallContext,
    tid: kernel_pid_t,
    f: impl FnOnce(&Thread) -> T,
) -> Result<T, Errno> {
    let current_tid = kernel_pid_t::from(ctx.objs.thread.id());
    if tid == 0 || tid == current_tid {
        return Ok(f(ctx.objs.thread));
    }

    let target_tid = ThreadId::try_from(tid).or(Err(Errno::ESRCH))?;
    let Some(thread_rc) = ctx.objs.host.thread_cloned_rc(target_tid) else {
        return Err(Errno::ESRCH);
    };
    let thread_rc =
        ExplicitDropper::new(thread_rc, |value| value.explicit_drop(ctx.objs.host.root()));
    let thread = thread_rc.borrow(ctx.objs.host.root());
    Ok(f(&thread))
}

/// Validate the priority rules for the scheduler policies Shadow tracks.
///
/// Non-realtime policies use a fixed priority of 0, while Linux accepts priorities 1 through 99
/// for `SCHED_FIFO` and `SCHED_RR`.
fn validate_sched_attrs(policy: Sched, priority: std::ffi::c_int) -> Result<(), Errno> {
    match policy {
        Sched::SCHED_NORMAL | Sched::SCHED_BATCH | Sched::SCHED_IDLE if priority == 0 => Ok(()),
        Sched::SCHED_FIFO | Sched::SCHED_RR if (1..=99).contains(&priority) => Ok(()),
        _ => Err(Errno::EINVAL),
    }
}

impl SyscallHandler {
    log_syscall!(
        sched_getaffinity,
        /* rv */ i32,
        // Non-deterministic due to https://github.com/shadow/shadow/issues/3626
        /* pid */
        SyscallNonDeterministicArg<kernel_pid_t>,
        /* cpusetsize */ usize,
        /* mask */ *const std::ffi::c_void,
    );
    pub fn sched_getaffinity(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
        cpusetsize: usize,
        // sched_getaffinity(2):
        // > The underlying system calls (which represent CPU masks as bit masks
        // > of type unsigned long *) impose no restriction on the size of the CPU
        // > mask
        mask_ptr: ForeignPtr<std::ffi::c_ulong>,
    ) -> Result<std::ffi::c_int, Errno> {
        let mask_ptr = mask_ptr.cast::<u8>();
        let mask_ptr = ForeignArrayPtr::new(mask_ptr, cpusetsize);

        let tid = ThreadId::try_from(tid).or(Err(Errno::ESRCH))?;
        if !ctx.objs.host.has_thread(tid) && kernel_pid_t::from(tid) != 0 {
            return Err(Errno::ESRCH);
        }

        // Shadow doesn't have users, so no need to check for permissions

        if cpusetsize == 0 {
            return Err(Errno::EINVAL);
        }

        let mut mem = ctx.objs.process.memory_borrow_mut();
        let mut mask = mem.memory_ref_mut(mask_ptr)?;

        // this assumes little endian
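        // (Bit 0 of the CPU mask corresponds to CPU 0 and, on a little-endian host, lives in the
        // lowest-order byte of the first `unsigned long`, so writing 0x01 to byte 0 reports an
        // affinity set containing only CPU 0. The return value is the number of mask bytes
        // filled in.)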
        let bytes_written = 1;
        mask[0] = 1;

        mask.flush()?;

        Ok(bytes_written)
    }

    log_syscall!(
        sched_setaffinity,
        /* rv */ i32,
        /* pid */ kernel_pid_t,
        /* cpusetsize */ usize,
        /* mask */ *const std::ffi::c_void,
    );
    pub fn sched_setaffinity(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
        cpusetsize: usize,
        // sched_getaffinity(2):
        // > The underlying system calls (which represent CPU masks as bit masks
        // > of type unsigned long *) impose no restriction on the size of the CPU
        // > mask
        mask_ptr: ForeignPtr<std::ffi::c_ulong>,
    ) -> Result<(), Errno> {
        let mask_ptr = mask_ptr.cast::<u8>();
        let mask_ptr = ForeignArrayPtr::new(mask_ptr, cpusetsize);

        let tid = ThreadId::try_from(tid).or(Err(Errno::ESRCH))?;
        if !ctx.objs.host.has_thread(tid) && kernel_pid_t::from(tid) != 0 {
            return Err(Errno::ESRCH);
        }

        // Shadow doesn't have users, so no need to check for permissions

        if cpusetsize == 0 {
            return Err(Errno::EINVAL);
        }

        let mem = ctx.objs.process.memory_borrow_mut();
        let mask = mem.memory_ref(mask_ptr)?;

        // this assumes little endian
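        // Shadow reports every thread as running on CPU 0, so the only requirement on the new
        // mask is that CPU 0 stay in the set; a mask with bit 0 clear is rejected, mirroring the
        // kernel's EINVAL for a mask that contains no usable CPU.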
        if mask[0] & 0x01 == 0 {
            return Err(Errno::EINVAL);
        }

        Ok(())
    }

    log_syscall!(
        sched_getparam,
        /* rv */ i32,
        /* pid */ kernel_pid_t,
        /* param */ *const linux_api::sched::sched_attr,
    );
    pub fn sched_getparam(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
        param_ptr: ForeignPtr<sched_attr>,
    ) -> Result<(), Errno> {
        warn_once_then_debug!(
            "sched_getparam() only returns tracked scheduler state; Shadow does not emulate Linux scheduling behavior"
        );

        let priority = with_sched_target_thread(ctx, tid, |thread| thread.sched_priority())?;
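        // `struct sched_param` consists of a single `int sched_priority` field, so writing the
        // priority as a c_int at the start of the caller's buffer fills in the whole struct.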
        ctx.objs
            .process
            .memory_borrow_mut()
            .write(param_ptr.cast::<std::ffi::c_int>(), &priority)?;

        Ok(())
    }

    log_syscall!(
        sched_getscheduler,
        /* rv */ i32,
        /* pid */ kernel_pid_t,
    );
    pub fn sched_getscheduler(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
    ) -> Result<std::ffi::c_int, Errno> {
        warn_once_then_debug!(
            "sched_getscheduler() only returns tracked scheduler state; Shadow does not emulate Linux scheduling behavior"
        );

        with_sched_target_thread(ctx, tid, |thread| {
            let mut policy = i32::from(thread.sched_policy());
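            // Linux reports a thread's reset-on-fork setting by ORing SCHED_RESET_ON_FORK into
            // the policy returned by sched_getscheduler(2); mirror that here.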
            if thread.sched_reset_on_fork() {
                policy |= SCHED_RESET_ON_FORK;
            }
            policy
        })
    }

    log_syscall!(
        sched_setparam,
        /* rv */ i32,
        /* pid */ kernel_pid_t,
        /* param */ *const linux_api::sched::sched_attr,
    );
    pub fn sched_setparam(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
        param_ptr: ForeignPtr<sched_attr>,
    ) -> Result<(), Errno> {
        warn_once_then_debug!(
            "sched_setparam() only updates tracked scheduler state; Shadow does not emulate Linux scheduling behavior"
        );

        let new_priority = ctx
            .objs
            .process
            .memory_borrow()
            .read(param_ptr.cast::<std::ffi::c_int>())?;
        let (policy, reset_on_fork) = with_sched_target_thread(ctx, tid, |thread| {
            (thread.sched_policy(), thread.sched_reset_on_fork())
        })?;

        validate_sched_attrs(policy, new_priority)?;

        with_sched_target_thread(ctx, tid, |thread| {
            thread.set_sched_attrs(policy, reset_on_fork, new_priority);
        })?;

        Ok(())
    }

    log_syscall!(
        sched_setscheduler,
        /* rv */ i32,
        /* pid */ kernel_pid_t,
        /* policy */ std::ffi::c_int,
        /* param */ *const linux_api::sched::sched_attr,
    );
    pub fn sched_setscheduler(
        ctx: &mut SyscallContext,
        tid: kernel_pid_t,
        policy: std::ffi::c_int,
        param_ptr: ForeignPtr<sched_attr>,
    ) -> Result<(), Errno> {
        warn_once_then_debug!(
            "sched_setscheduler() only updates tracked scheduler state; Shadow does not emulate Linux scheduling behavior"
        );

        let new_priority = ctx
            .objs
            .process
            .memory_borrow()
            .read(param_ptr.cast::<std::ffi::c_int>())?;
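        // The caller may OR SCHED_RESET_ON_FORK into the policy argument (see sched(7)); split
        // the flag off before mapping the remaining bits onto a known policy.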
        let reset_on_fork = (policy & SCHED_RESET_ON_FORK) != 0;
        let policy = Sched::try_from(policy & !SCHED_RESET_ON_FORK).or(Err(Errno::EINVAL))?;
        let policy = match policy {
            Sched::SCHED_NORMAL
            | Sched::SCHED_FIFO
            | Sched::SCHED_RR
            | Sched::SCHED_BATCH
            | Sched::SCHED_IDLE => policy,
            Sched::SCHED_DEADLINE => {
                warn_once_then_debug!(
                    "sched_setscheduler() rejects SCHED_DEADLINE because Shadow does not implement deadline scheduling semantics"
                );
                return Err(Errno::EINVAL);
            }
            Sched::SCHED_EXT => {
                warn_once_then_debug!(
                    "sched_setscheduler() rejects SCHED_EXT because Shadow does not implement BPF-defined scheduler semantics"
                );
                return Err(Errno::EINVAL);
            }
        };
        validate_sched_attrs(policy, new_priority)?;

        with_sched_target_thread(ctx, tid, |thread| {
            thread.set_sched_attrs(policy, reset_on_fork, new_priority);
        })?;

        Ok(())
    }

    log_syscall!(
        rseq,
        /* rv */ i32,
        /* rseq */ *const std::ffi::c_void,
        /* rseq_len */ u32,
        /* flags */ i32,
        /* sig */ u32,
    );
    pub fn rseq(
        ctx: &mut SyscallContext,
        rseq_ptr: ForeignPtr<MaybeUninit<u8>>,
        rseq_len: u32,
        flags: std::ffi::c_int,
        _sig: u32,
    ) -> Result<(), Errno> {
        // we won't need more bytes than the size of the `rseq` struct
        let rseq_len = rseq_len.try_into().unwrap();
        let rseq_len = std::cmp::min(rseq_len, std::mem::size_of::<rseq>());

        let flags = rseq_flags::from_bits_retain(flags);
        let unknown_flags = flags.unknown_bits();

        if unknown_flags != 0 {
            warn!("Unrecognized rseq flags: 0x{unknown_flags:x} in {flags:?}");
            return Err(Errno::EINVAL);
        }
        if flags.contains(rseq_flags::RSEQ_FLAG_UNREGISTER) {
            // TODO:
            // * Validate that an rseq was previously registered
            // * Validate that `sig` matches registration
            // * Set the cpu_id of the previously registered rseq to the uninitialized
            //   state.
            return Ok(());
        }

        // The `rseq` struct is designed to grow as Linux adds more features, so we can't assume
        // that the application making the rseq syscall uses the exact same struct as the one
        // available in the linux_api crate (the calling application's rseq struct may have more
        // or fewer fields). Furthermore, the rseq struct ends with a "flexible array member",
        // which means it cannot be `Copy` and therefore cannot be `Pod`.
        //
        // Instead, we treat the rseq struct as a bunch of bytes and write to individual fields
        // where possible, without making assumptions about the size of the data.
        let mut mem = ctx.objs.process.memory_borrow_mut();
        let mut rseq_mem = mem.memory_ref_mut(ForeignArrayPtr::new(rseq_ptr, rseq_len))?;
        let rseq_bytes = &mut *rseq_mem;

        // rseq is mostly unimplemented, but also mostly unneeded in Shadow.
        // We'd only need to implement the "real" functionality if we ever implement
        // true preemption, in which case we'd need to do something if we ever preempted
        // while the user code was in a restartable sequence. As it is, Shadow only
        // reschedules threads at system calls, and system calls are disallowed inside
        // restartable sequences.
        //
        // TODO: One place where Shadow might need to implement rseq recovery is
        // if a hardware-based signal is delivered in the middle of an
        // interruptible sequence, e.g. the code in the rseq accesses an
        // invalid address, raising SIGSEGV, which is then caught and recovered
        // from in a handler.
        // https://github.com/shadow/shadow/issues/2139
        //
        // For now we just update to reflect that the thread is running on CPU 0.

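        // Project the `cpu_id` and `cpu_id_start` fields out of the raw rseq bytes. If the
        // projection fails (e.g. because the caller-supplied area is too small to cover those
        // fields), report EINVAL.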
        let Some((cpu_id, cpu_id_start)) = field_project!(rseq_bytes, rseq, (cpu_id, cpu_id_start))
        else {
            return Err(Errno::EINVAL);
        };

        cpu_id.write(CURRENT_CPU);
        cpu_id_start.write(CURRENT_CPU);

        rseq_mem.flush()?;

        Ok(())
    }
}