shadow_rs/host/syscall/handler/
epoll.rs

1use std::ops::DerefMut;
2use std::sync::Arc;
3
4use linux_api::epoll::{EpollCreateFlags, EpollCtlOp, EpollEvents};
5use linux_api::errno::Errno;
6use linux_api::fcntl::DescriptorFlags;
7use shadow_shim_helper_rs::simulation_time::SimulationTime;
8use shadow_shim_helper_rs::syscall_types::ForeignPtr;
9
10use crate::core::worker::Worker;
11use crate::cshadow;
12use crate::host::descriptor::descriptor_table::DescriptorHandle;
13use crate::host::descriptor::epoll::Epoll;
14use crate::host::descriptor::{CompatFile, Descriptor, File, FileState, OpenFile};
15use crate::host::memory_manager::MemoryManager;
16use crate::host::syscall::handler::{SyscallContext, SyscallHandler};
17use crate::host::syscall::types::{ForeignArrayPtr, SyscallError};
18use crate::utility::callback_queue::CallbackQueue;
19
20impl SyscallHandler {
21    log_syscall!(
22        epoll_create,
23        /* rv */ std::ffi::c_int,
24        /* size */ std::ffi::c_int,
25    );
26    pub fn epoll_create(
27        ctx: &mut SyscallContext,
28        size: std::ffi::c_int,
29    ) -> Result<DescriptorHandle, Errno> {
30        // epoll_create(2): "Since Linux 2.6.8, the size argument is ignored, but must be greater
31        // than zero"
32        if size <= 0 {
33            return Err(Errno::EINVAL);
34        }
35
36        Self::epoll_create_helper(ctx, 0)
37    }
38
39    log_syscall!(
40        epoll_create1,
41        /* rv */ std::ffi::c_int,
42        /* flags */ std::ffi::c_int,
43    );
44    pub fn epoll_create1(
45        ctx: &mut SyscallContext,
46        flags: std::ffi::c_int,
47    ) -> Result<DescriptorHandle, Errno> {
48        Self::epoll_create_helper(ctx, flags)
49    }
50
51    fn epoll_create_helper(
52        ctx: &mut SyscallContext,
53        flags: std::ffi::c_int,
54    ) -> Result<DescriptorHandle, Errno> {
55        // See here for the order that the input args are checked in Linux:
56        // https://github.com/torvalds/linux/blob/2cf0f715623872823a72e451243bbf555d10d032/fs/eventpoll.c#L2030
57        let Some(flags) = EpollCreateFlags::from_bits(flags) else {
58            log::debug!("Invalid epoll_create flags: {flags}");
59            return Err(Errno::EINVAL);
60        };
61
62        let mut desc_flags = DescriptorFlags::empty();
63
64        if flags.contains(EpollCreateFlags::EPOLL_CLOEXEC) {
65            desc_flags.insert(DescriptorFlags::FD_CLOEXEC);
66        }
67
68        let epoll = Epoll::new();
69        let mut desc = Descriptor::new(CompatFile::New(OpenFile::new(File::Epoll(epoll))));
70        desc.set_flags(desc_flags);
71
72        let fd = ctx
73            .objs
74            .thread
75            .descriptor_table_borrow_mut(ctx.objs.host)
76            .register_descriptor(desc)
77            .or(Err(Errno::ENFILE))?;
78
79        log::trace!("Created epoll fd {fd}");
80
81        Ok(fd)
82    }
83
84    log_syscall!(
85        epoll_ctl,
86        /* rv */ std::ffi::c_int,
87        /* epfd */ std::ffi::c_int,
88        /* op */ std::ffi::c_int,
89        /* fd */ std::ffi::c_int,
90        /* event */ *const std::ffi::c_void,
91    );
92    pub fn epoll_ctl(
93        ctx: &mut SyscallContext,
94        epfd: std::ffi::c_int,
95        op: std::ffi::c_int,
96        fd: std::ffi::c_int,
97        event_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
98    ) -> Result<(), Errno> {
99        // See here for the order that the input args are checked in Linux:
100        // https://github.com/torvalds/linux/blob/2cf0f715623872823a72e451243bbf555d10d032/fs/eventpoll.c#L2111
101
102        // We'll need to look up descriptors.
103        let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
104
105        // Get the epoll descriptor, or return early if it doesn't exist.
106        let (epoll, epoll_canon_handle) = {
107            let desc = Self::get_descriptor(&desc_table, epfd)?;
108
109            let CompatFile::New(epoll) = desc.file() else {
110                return Err(Errno::EINVAL);
111            };
112
113            let epoll_canon_handle = epoll.inner_file().canonical_handle();
114
115            let File::Epoll(epoll) = epoll.inner_file() else {
116                return Err(Errno::EINVAL);
117            };
118
119            (epoll, epoll_canon_handle)
120        };
121
122        // Get the target descriptor, or return errors as appropriate.
123        let target = {
124            let desc = Self::get_descriptor(&desc_table, fd)?;
125
126            // Our epoll implementation only supports adding new Rust descriptor types.
127            // However, the only legacy type remaining in Shadow is a regular file, and
128            // epoll_ctl(2) states that EPERM should be returned for regular files and
129            // other files that don't support epolling.
130            match desc.file() {
131                CompatFile::New(file) => file.inner_file().clone(),
132                CompatFile::Legacy(file) => {
133                    let file_type = unsafe { cshadow::legacyfile_getType(file.ptr()) };
134                    if file_type == cshadow::_LegacyFileType_DT_FILE {
135                        // Epoll doesn't support regular files.
136                        return Err(Errno::EPERM);
137                    } else {
138                        // Our implementation doesn't support other legacy types.
139                        // We don't think we have such types remaining, but warn anyway.
140                        warn_once_then_trace!(
141                            "Attempted to add a legacy file to an epoll file, which \
142                            shadow doesn't support"
143                        );
144                        return Err(Errno::EINVAL);
145                    }
146                }
147            }
148        };
149
150        // An epoll instance is not allowed to monitor itself.
151        if epoll_canon_handle == target.canonical_handle() {
152            return Err(Errno::EINVAL);
153        }
154
155        // Extract the operation.
156        let Ok(op) = EpollCtlOp::try_from(op) else {
157            log::debug!("Invalid epoll op: {op}");
158            return Err(Errno::EINVAL);
159        };
160
161        // Extract the events and data.
162        let (events, data) = if op == EpollCtlOp::EPOLL_CTL_DEL {
163            // epoll_ctl(2): Since Linux 2.6.9, the event pointer is ignored and can be specified as
164            // NULL when using EPOLL_CTL_DEL.
165            (EpollEvents::empty(), 0)
166        } else {
167            let mem = ctx.objs.process.memory_borrow();
168            let ev = mem.read(event_ptr)?;
169
170            let Some(mut events) = EpollEvents::from_bits(ev.events) else {
171                // Braces are needed around `ev.events` for alignment (see rustc --explain E0793).
172                log::debug!("Invalid epoll_ctl events: {}", { ev.events });
173                return Err(Errno::EINVAL);
174            };
175
176            // epoll_ctl(2): epoll always reports for EPOLLERR and EPOLLHUP
177            events.insert(EpollEvents::EPOLLERR | EpollEvents::EPOLLHUP);
178
179            (events, ev.data)
180        };
181
182        log::trace!("Calling epoll_ctl on epoll {epfd} with child {fd}");
183
184        CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
185            let weak_epoll = Arc::downgrade(epoll);
186            epoll
187                .borrow_mut()
188                .ctl(op, fd, target, events, data, weak_epoll, cb_queue)
189        })?;
190        Ok(())
191    }
192
193    log_syscall!(
194        epoll_wait,
195        /* rv */ std::ffi::c_int,
196        /* epfd */ std::ffi::c_int,
197        /* events */ *const std::ffi::c_void,
198        /* max_events */ std::ffi::c_int,
199        /* timeout */ std::ffi::c_int,
200    );
201    pub fn epoll_wait(
202        ctx: &mut SyscallContext,
203        epfd: std::ffi::c_int,
204        events_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
205        max_events: std::ffi::c_int,
206        timeout: std::ffi::c_int,
207    ) -> Result<std::ffi::c_int, SyscallError> {
208        // Note that timeout is given in milliseconds.
209        let timeout = timeout_arg_to_maybe_simtime(timeout)?;
210        Self::epoll_wait_helper(ctx, epfd, events_ptr, max_events, timeout, None)
211    }
212
213    log_syscall!(
214        epoll_pwait,
215        /* rv */ std::ffi::c_int,
216        /* epfd */ std::ffi::c_int,
217        /* events */ *const std::ffi::c_void,
218        /* max_events */ std::ffi::c_int,
219        /* timeout */ std::ffi::c_int,
220        /* sigmask */ *const std::ffi::c_void,
221        /* sigsetsize */ linux_api::posix_types::kernel_size_t,
222    );
223    pub fn epoll_pwait(
224        ctx: &mut SyscallContext,
225        epfd: std::ffi::c_int,
226        events_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
227        max_events: std::ffi::c_int,
228        timeout: std::ffi::c_int,
229        sigmask_ptr: ForeignPtr<linux_api::signal::sigset_t>,
230        _sigsetsize: linux_api::posix_types::kernel_size_t,
231    ) -> Result<std::ffi::c_int, SyscallError> {
232        // epoll_wait(2): "The sigmask argument may be specified as NULL, in which case
233        // epoll_pwait() is equivalent to epoll_wait()"
234        let sigmask = if sigmask_ptr.is_null() {
235            None
236        } else {
237            Some(ctx.objs.process.memory_borrow().read(sigmask_ptr)?)
238        };
239
240        // Note that timeout is given in milliseconds.
241        let timeout = timeout_arg_to_maybe_simtime(timeout)?;
242        Self::epoll_wait_helper(ctx, epfd, events_ptr, max_events, timeout, sigmask)
243    }
244
245    log_syscall!(
246        epoll_pwait2,
247        /* rv */ std::ffi::c_int,
248        /* epfd */ std::ffi::c_int,
249        /* events */ *const std::ffi::c_void,
250        /* max_events */ std::ffi::c_int,
251        /* timeout */ *const std::ffi::c_void,
252        /* sigmask */ *const std::ffi::c_void,
253        /* sigsetsize */ linux_api::posix_types::kernel_size_t,
254    );
255    pub fn epoll_pwait2(
256        ctx: &mut SyscallContext,
257        epfd: std::ffi::c_int,
258        events_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
259        max_events: std::ffi::c_int,
260        timeout_ptr: ForeignPtr<linux_api::time::timespec>,
261        sigmask_ptr: ForeignPtr<linux_api::signal::sigset_t>,
262        _sigsetsize: linux_api::posix_types::kernel_size_t,
263    ) -> Result<std::ffi::c_int, SyscallError> {
264        let (sigmask, timeout) = {
265            let mem = ctx.objs.process.memory_borrow();
266
267            // epoll_wait(2): "The sigmask argument may be specified as NULL, in which case
268            // epoll_pwait() is equivalent to epoll_wait()"
269            let sigmask = if sigmask_ptr.is_null() {
270                None
271            } else {
272                Some(mem.read(sigmask_ptr)?)
273            };
274
275            // epoll_wait(2): "If timeout is NULL, then epoll_pwait2() can block indefinitely"
276            let timeout = if timeout_ptr.is_null() {
277                None
278            } else {
279                let tspec = mem.read(timeout_ptr)?;
280                let sim_time = SimulationTime::try_from(tspec).map_err(|_| Errno::EINVAL)?;
281                Some(sim_time)
282            };
283
284            (sigmask, timeout)
285        };
286
287        Self::epoll_wait_helper(ctx, epfd, events_ptr, max_events, timeout, sigmask)
288    }
289
290    fn epoll_wait_helper(
291        ctx: &mut SyscallContext,
292        epfd: std::ffi::c_int,
293        events_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
294        max_events: std::ffi::c_int,
295        timeout: Option<SimulationTime>,
296        sigmask: Option<linux_api::signal::sigset_t>,
297    ) -> Result<std::ffi::c_int, SyscallError> {
298        // Linux enforces a range for max_events.
299        let max_events = {
300            let upper_bound = epoll_max_events_upper_bound();
301
302            if max_events <= 0 || max_events > upper_bound {
303                log::trace!(
304                    "Epoll maxevents {max_events} is not greater than 0 \
305                            and less than {upper_bound}"
306                );
307                return Err(Errno::EINVAL.into());
308            }
309
310            u32::try_from(max_events).unwrap()
311        };
312
313        // TODO: support the signal mask
314        if sigmask.is_some() {
315            warn_once_then_trace!(
316                "Epoll pwait called with non-null sigmask, \
317                which is not yet supported by shadow; returning EINVAL"
318            );
319            return Err(Errno::EINVAL.into());
320        }
321
322        // Get the descriptor, or return early if it doesn't exist.
323        let desc_table = ctx.objs.thread.descriptor_table_borrow(ctx.objs.host);
324        let epoll = {
325            let desc = Self::get_descriptor(&desc_table, epfd)?;
326
327            let CompatFile::New(epoll) = desc.file() else {
328                return Err(Errno::EINVAL.into());
329            };
330
331            let File::Epoll(epoll) = epoll.inner_file() else {
332                return Err(Errno::EINVAL.into());
333            };
334
335            epoll
336        };
337
338        if epoll.borrow().has_ready_events() {
339            log::trace!("Epoll {epfd} has ready events");
340
341            // We must not return an error after collecting events from epoll, otherwise the epoll
342            // state will become inconsitent with the view of events from the managed process.
343            // Thus, we explicitly check that we have a valid location to return the events before
344            // we collect them from epoll.
345            if events_ptr.is_null() {
346                return Err(Errno::EFAULT.into());
347            }
348
349            // After we collect the events here, failing to write them out to the events_ptr
350            // ForeignPointer below will leave our event state inconsistent with the managed
351            // process's understanding of the available events.
352            let ready = CallbackQueue::queue_and_run_with_legacy(|cb_queue| {
353                epoll
354                    .borrow_mut()
355                    .collect_ready_events(cb_queue, max_events)
356            });
357            let n_ready = ready.len();
358            if n_ready > max_events as usize {
359                panic!("Epoll should not return more than {max_events} events");
360            }
361
362            // Write the events out to the managed process memory.
363            let mut mem = ctx.objs.process.memory_borrow_mut();
364            write_events_to_ptr(&mut mem, ready, events_ptr)?;
365
366            // Return the number of events we are reporting.
367            log::trace!("Epoll {epfd} returning {n_ready} events");
368            return Ok(n_ready.try_into().unwrap());
369        }
370
371        // Our behavior depends on the value of timeout.
372        // Return immediately if timeout is 0.
373        if let Some(timeout) = timeout {
374            if timeout.is_zero() {
375                log::trace!("No events are ready on epoll {epfd} and the timeout is 0");
376                return Ok(0);
377            }
378        }
379
380        // Return immediately if we were already blocked for a while and still have no events.
381        // Condition will only exist after a wakeup.
382        if let Some(cond) = ctx.objs.thread.syscall_condition() {
383            if let Some(abs_timeout) = cond.timeout() {
384                if Worker::current_time().unwrap() >= abs_timeout {
385                    log::trace!("No events are ready on epoll {epfd} and the timeout expired");
386                    return Ok(0);
387                }
388            }
389        }
390
391        // If there's a signal pending, this syscall will be interrupted.
392        if ctx.objs.thread.unblocked_signal_pending(
393            ctx.objs.process,
394            &ctx.objs.host.shim_shmem_lock_borrow().unwrap(),
395        ) {
396            return Err(SyscallError::new_interrupted(false));
397        }
398
399        // Convert timeout to an EmulatedTime.
400        let Ok(abs_timeout_opt) = timeout
401            .map(|x| Worker::current_time().unwrap().checked_add(x).ok_or(()))
402            .transpose()
403        else {
404            log::trace!("Epoll wait with invalid timeout {timeout:?} (too large)");
405            return Err(Errno::EINVAL.into());
406        };
407
408        log::trace!("No events are ready on epoll {epfd} and we need to block");
409
410        // Block on epoll state; an epoll descriptor is readable when it has events.
411        let mut rv = SyscallError::new_blocked_on_file(
412            File::Epoll(Arc::clone(epoll)),
413            FileState::READABLE,
414            /* restartable= */ false,
415        );
416
417        // Set timeout, if provided.
418        if abs_timeout_opt.is_some() {
419            rv.blocked_condition().unwrap().set_timeout(abs_timeout_opt);
420        }
421
422        Err(rv)
423    }
424}
425
426fn timeout_arg_to_maybe_simtime(
427    timeout_ms: std::ffi::c_int,
428) -> Result<Option<SimulationTime>, Errno> {
429    // epoll_wait(2): "Specifying a timeout of -1 causes epoll_wait() to block indefinitely"
430    let timeout_ms = (timeout_ms >= 0).then_some(timeout_ms);
431
432    if let Some(timeout_ms) = timeout_ms {
433        // a non-negative c_int should always convert to a u64
434        let timeout_ms = timeout_ms.try_into().unwrap();
435        let timeout = SimulationTime::try_from_millis(timeout_ms).ok_or(Errno::EINVAL)?;
436        Ok(Some(timeout))
437    } else {
438        Ok(None)
439    }
440}
441
442/// There is a maximum number of events that can be specified in Linux:
443/// https://github.com/torvalds/linux/blob/2cf0f715623872823a72e451243bbf555d10d032/fs/eventpoll.c#L2291
444///
445/// The maximum is defined as:
446///   `#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))`
447/// https://github.com/torvalds/linux/blob/2cf0f715623872823a72e451243bbf555d10d032/fs/eventpoll.c#L95
448///
449/// This function performs the above computation as Linux does.
450fn epoll_max_events_upper_bound() -> i32 {
451    let ep_max_events = i32::MAX;
452    let ep_ev_size: i32 = std::mem::size_of::<linux_api::epoll::epoll_event>()
453        .try_into()
454        .unwrap_or(i32::MAX);
455    ep_max_events.saturating_div(ep_ev_size)
456}
457
458fn write_events_to_ptr(
459    mem: &mut MemoryManager,
460    ready: Vec<(EpollEvents, u64)>,
461    events_ptr: ForeignPtr<linux_api::epoll::epoll_event>,
462) -> Result<(), Errno> {
463    let events_ptr = ForeignArrayPtr::new(events_ptr, ready.len());
464    let mut mem_ref = mem.memory_ref_mut(events_ptr)?;
465
466    for ((ev, data), plugin_ev) in ready.iter().zip(mem_ref.deref_mut().iter_mut()) {
467        plugin_ev.events = ev.bits();
468        plugin_ev.data = *data;
469    }
470
471    mem_ref.flush()?;
472
473    Ok(())
474}