rustix/backend/linux_raw/
vdso_wrappers.rs

1//! Implement syscalls using the vDSO.
2//!
3//! <https://man7.org/linux/man-pages/man7/vdso.7.html>
4//!
5//! # Safety
6//!
7//! Similar to syscalls.rs, this file performs raw system calls, and sometimes
8//! passes them uninitialized memory buffers. This file also calls vDSO
9//! functions.
10#![allow(unsafe_code)]
11#![allow(clippy::missing_transmute_annotations)]
12
13#[cfg(target_arch = "x86")]
14use super::reg::{ArgReg, RetReg, SyscallNumber, A0, A1, A2, A3, A4, A5, R0};
15use super::vdso;
16#[cfg(target_arch = "x86")]
17use core::arch::global_asm;
18#[cfg(feature = "process")]
19#[cfg(any(
20    target_arch = "x86_64",
21    target_arch = "x86",
22    target_arch = "riscv64",
23    target_arch = "powerpc64"
24))]
25use core::ffi::c_void;
26use core::mem::transmute;
27use core::ptr::null_mut;
28use core::sync::atomic::AtomicPtr;
29use core::sync::atomic::Ordering::Relaxed;
30#[cfg(target_pointer_width = "32")]
31#[cfg(feature = "time")]
32use linux_raw_sys::general::timespec as __kernel_old_timespec;
33#[cfg(any(
34    all(
35        feature = "process",
36        any(
37            target_arch = "x86_64",
38            target_arch = "x86",
39            target_arch = "riscv64",
40            target_arch = "powerpc64"
41        )
42    ),
43    feature = "time"
44))]
45use {super::c, super::conv::ret, core::mem::MaybeUninit};
46#[cfg(feature = "time")]
47use {
48    super::conv::c_int,
49    crate::clockid::{ClockId, DynamicClockId},
50    crate::io,
51    crate::timespec::Timespec,
52    linux_raw_sys::general::{__kernel_clockid_t, __kernel_timespec},
53};
54
/// Read the current value of `which_clock` through the cached
/// `clock_gettime` entry point.
///
/// The entry point is taken from `CLOCK_GETTIME`; if that slot is still null,
/// `init_clock_gettime` is called to populate it and supply the function.
///
/// # Panics
///
/// Panics if the entry point returns a nonzero status.
#[cfg(feature = "time")]
#[inline]
pub(crate) fn clock_gettime(which_clock: ClockId) -> __kernel_timespec {
    let mut buf = MaybeUninit::<__kernel_timespec>::uninit();

    // SAFETY: `CLOCK_GETTIME` contains either null or the address of a
    // function with an ABI like libc `clock_gettime`, and calling it has the
    // side effect of writing to the result buffer, and no others.
    unsafe {
        // A null pointer transmutes to `None`, which triggers the cold
        // initialization path.
        let entry = transmute::<*mut Function, Option<ClockGettimeType>>(
            CLOCK_GETTIME.load(Relaxed),
        )
        .unwrap_or_else(init_clock_gettime);

        let status = entry(which_clock as c::c_int, buf.as_mut_ptr());

        // The `ClockId` enum only contains clocks which never fail. This is
        // deliberately not a `debug_assert_eq`: the clocks can still fail on
        // uncommon kernel configs, and reading the buffer after a failure
        // would be undefined behavior, so the check stays in release builds.
        assert_eq!(status, 0);
        buf.assume_init()
    }
}
76
/// Read the current value of a clock that is only known at run time.
///
/// `which_clock` is first converted to a raw kernel clock id, then the cached
/// `clock_gettime` entry point is called with it.
///
/// # Errors
///
/// Returns `io::Errno::INVAL` if the entry point reports `-EINVAL`. For any
/// other nonzero status the request is retried with
/// `_rustix_clock_gettime_via_syscall`, and that call's error (if any) is
/// returned.
#[cfg(feature = "time")]
#[inline]
pub(crate) fn clock_gettime_dynamic(which_clock: DynamicClockId<'_>) -> io::Result<Timespec> {
    let raw_id = match which_clock {
        DynamicClockId::Known(known) => known as __kernel_clockid_t,

        DynamicClockId::Dynamic(fd) => {
            // See `FD_TO_CLOCKID` in Linux's `clock_gettime` documentation.
            use crate::backend::fd::AsRawFd;
            const CLOCKFD: i32 = 3;
            let inverted_fd = !fd.as_raw_fd();
            ((inverted_fd << 3) | CLOCKFD) as __kernel_clockid_t
        }

        DynamicClockId::RealtimeAlarm => c::CLOCK_REALTIME_ALARM as __kernel_clockid_t,
        DynamicClockId::Tai => c::CLOCK_TAI as __kernel_clockid_t,
        DynamicClockId::Boottime => c::CLOCK_BOOTTIME as __kernel_clockid_t,
        DynamicClockId::BoottimeAlarm => c::CLOCK_BOOTTIME_ALARM as __kernel_clockid_t,
    };

    // The entry point reports errors as negated errno values.
    const NEG_EINVAL: c::c_int = -(c::EINVAL as c::c_int);

    let mut out = MaybeUninit::<Timespec>::uninit();

    // SAFETY: `CLOCK_GETTIME` contains either null or the address of a
    // function with an ABI like libc `clock_gettime`, and calling it has the
    // side effect of writing to the result buffer, and no others.
    unsafe {
        let entry = transmute::<*mut Function, Option<ClockGettimeType>>(
            CLOCK_GETTIME.load(Relaxed),
        )
        .unwrap_or_else(init_clock_gettime);

        let status = entry(raw_id, out.as_mut_ptr());
        if status == NEG_EINVAL {
            return Err(io::Errno::INVAL);
        }
        if status != 0 {
            // Any other failure: retry through the real system call and
            // propagate whatever it reports.
            _rustix_clock_gettime_via_syscall(raw_id, out.as_mut_ptr())?;
        }
        Ok(out.assume_init())
    }
}
114
/// Return the CPU number reported by the cached `getcpu` entry point.
///
/// Only the CPU output is requested; the other two arguments are passed as
/// null pointers. The returned status is checked only in builds with debug
/// assertions enabled.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
#[inline]
pub(crate) fn sched_getcpu() -> usize {
    let mut cpu_slot = MaybeUninit::<u32>::uninit();

    // SAFETY: `GETCPU` contains either null or the address of a function with
    // an ABI like libc `getcpu`, and calling it has the side effect of writing
    // to the result buffers, and no others.
    unsafe {
        // A null pointer transmutes to `None`, which triggers the cold
        // initialization path.
        let entry = transmute::<*mut Function, Option<GetcpuType>>(GETCPU.load(Relaxed))
            .unwrap_or_else(init_getcpu);

        let status = entry(cpu_slot.as_mut_ptr(), null_mut(), null_mut());
        debug_assert_eq!(status, 0);
        cpu_slot.assume_init() as usize
    }
}
138
/// System-call wrappers for 32-bit x86 that go through the function pointer
/// cached in `SYSCALL` instead of embedding a trap instruction inline.
///
/// Each `syscallN` resolves the entry point (initializing it on first use)
/// and forwards the syscall number and its `N` arguments to the matching
/// `asm::indirect_syscallN` routine.
#[cfg(target_arch = "x86")]
pub(super) mod x86_via_vdso {
    use super::{transmute, ArgReg, Relaxed, RetReg, SyscallNumber, A0, A1, A2, A3, A4, A5, R0};
    use crate::backend::arch::asm;

    /// Load the cached syscall entry point, running the cold initialization
    /// path if the slot is still null.
    #[inline]
    unsafe fn syscall_entry() -> super::SyscallType {
        // A null pointer transmutes to `None`.
        match transmute(super::SYSCALL.load(Relaxed)) {
            Some(entry) => entry,
            None => super::init_syscall(),
        }
    }

    /// Perform a system call with no arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall0(nr: SyscallNumber<'_>) -> RetReg<R0> {
        asm::indirect_syscall0(syscall_entry(), nr)
    }

    /// Perform a system call with one argument.
    #[inline]
    pub(in crate::backend) unsafe fn syscall1<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
    ) -> RetReg<R0> {
        asm::indirect_syscall1(syscall_entry(), nr, a0)
    }

    /// Perform a one-argument system call that does not return.
    #[inline]
    pub(in crate::backend) unsafe fn syscall1_noreturn<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
    ) -> ! {
        asm::indirect_syscall1_noreturn(syscall_entry(), nr, a0)
    }

    /// Perform a system call with two arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall2<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
    ) -> RetReg<R0> {
        asm::indirect_syscall2(syscall_entry(), nr, a0, a1)
    }

    /// Perform a system call with three arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall3<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
    ) -> RetReg<R0> {
        asm::indirect_syscall3(syscall_entry(), nr, a0, a1, a2)
    }

    /// Perform a system call with four arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall4<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
    ) -> RetReg<R0> {
        asm::indirect_syscall4(syscall_entry(), nr, a0, a1, a2, a3)
    }

    /// Perform a system call with five arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall5<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
        a4: ArgReg<'a, A4>,
    ) -> RetReg<R0> {
        asm::indirect_syscall5(syscall_entry(), nr, a0, a1, a2, a3, a4)
    }

    /// Perform a system call with six arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall6<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
        a4: ArgReg<'a, A4>,
        a5: ArgReg<'a, A5>,
    ) -> RetReg<R0> {
        asm::indirect_syscall6(syscall_entry(), nr, a0, a1, a2, a3, a4, a5)
    }

    // With the indirect call, it isn't meaningful to do a separate
    // `_readonly` optimization, so the `_readonly` names simply alias the
    // regular wrappers.
    #[allow(unused_imports)]
    pub(in crate::backend) use {
        syscall0 as syscall0_readonly, syscall1 as syscall1_readonly,
        syscall2 as syscall2_readonly, syscall3 as syscall3_readonly,
        syscall4 as syscall4_readonly, syscall5 as syscall5_readonly,
        syscall6 as syscall6_readonly,
    };
}
262
/// Signature shared by the vDSO `clock_gettime` entry point and the
/// syscall-based fallback stored in `CLOCK_GETTIME`: takes a clock id and a
/// result buffer, and returns a status code.
#[cfg(feature = "time")]
type ClockGettimeType =
    unsafe extern "C" fn(clockid: c::c_int, res: *mut Timespec) -> c::c_int;
265
/// Signature shared by the vDSO `getcpu` entry point and the syscall-based
/// fallback stored in `GETCPU`: takes three output/auxiliary pointers and
/// returns a status code.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
type GetcpuType =
    unsafe extern "C" fn(cpu: *mut u32, node: *mut u32, unused: *mut c_void) -> c::c_int;
274
/// Type of the entry point stored in `SYSCALL` on 32-bit x86.
///
/// The underlying syscall functions are only called from asm, using the
/// special syscall calling convention to pass arguments and return values,
/// which the signature here doesn't reflect. Values of this type must
/// therefore not be called directly from Rust; they are only handed to the
/// `asm::indirect_syscall*` routines.
#[cfg(target_arch = "x86")]
pub(super) type SyscallType = unsafe extern "C" fn();
280
/// Initialize `CLOCK_GETTIME` and return its value.
///
/// Marked `#[cold]` because it only runs when a caller finds the slot still
/// null.
#[cfg(feature = "time")]
#[cold]
fn init_clock_gettime() -> ClockGettimeType {
    init();
    let raw = CLOCK_GETTIME.load(Relaxed);
    // SAFETY: Load the function address from static storage that we just
    // initialized.
    unsafe { transmute::<*mut Function, ClockGettimeType>(raw) }
}
290
/// Initialize `GETCPU` and return its value.
///
/// Marked `#[cold]` because it only runs when a caller finds the slot still
/// null.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
#[cold]
fn init_getcpu() -> GetcpuType {
    init();
    let raw = GETCPU.load(Relaxed);
    // SAFETY: Load the function address from static storage that we just
    // initialized.
    unsafe { transmute::<*mut Function, GetcpuType>(raw) }
}
306
/// Initialize `SYSCALL` and return its value.
///
/// Marked `#[cold]` because it only runs when a caller finds the slot still
/// null.
#[cfg(target_arch = "x86")]
#[cold]
fn init_syscall() -> SyscallType {
    init();
    let raw = SYSCALL.load(Relaxed);
    // SAFETY: Load the function address from static storage that we just
    // initialized.
    unsafe { transmute::<*mut Function, SyscallType>(raw) }
}
316
/// `AtomicPtr` can't hold a `fn` pointer, so we use a `*` pointer to this
/// placeholder type, and cast it as needed.
struct Function;

// Each slot below starts out null. `minimal_init` fills a null slot with a
// syscall-based fallback, and `init` may then overwrite it with an address
// looked up in the vDSO. Readers treat null as "not yet initialized".

// Entry point used by `clock_gettime` and `clock_gettime_dynamic`.
#[cfg(feature = "time")]
static CLOCK_GETTIME: AtomicPtr<Function> = AtomicPtr::new(null_mut());
// Entry point used by `sched_getcpu`.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
static GETCPU: AtomicPtr<Function> = AtomicPtr::new(null_mut());
// Entry point used by the `x86_via_vdso` syscall wrappers.
#[cfg(target_arch = "x86")]
static SYSCALL: AtomicPtr<Function> = AtomicPtr::new(null_mut());
332
/// Fallback with the `ClockGettimeType` ABI that obtains the time through a
/// real system call.
///
/// Returns `0` on success, or the negated raw OS error code on failure.
#[cfg(feature = "time")]
unsafe extern "C" fn rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> c::c_int {
    if let Err(err) = _rustix_clock_gettime_via_syscall(clockid, res) {
        return err.raw_os_error().wrapping_neg();
    }
    0
}
343
/// 32-bit implementation: issue `clock_gettime64`, and if the kernel reports
/// `NOSYS`, retry through `_rustix_clock_gettime_via_syscall_old`.
///
/// Every other outcome, success or error, is returned unchanged.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "32")]
unsafe fn _rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    let outcome = ret(syscall!(__NR_clock_gettime64, c_int(clockid), res));
    if matches!(outcome, Err(io::Errno::NOSYS)) {
        return _rustix_clock_gettime_via_syscall_old(clockid, res);
    }
    outcome
}
356
/// 32-bit fallback: issue the legacy `clock_gettime` system call and widen
/// its result into a `Timespec` written to `res`.
///
/// On failure the error is returned and `res` is left untouched.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "32")]
unsafe fn _rustix_clock_gettime_via_syscall_old(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    // Ordinarily `rustix` doesn't like to emulate system calls, but in the
    // case of time APIs, it's specific to Linux, specific to 32-bit
    // architectures *and* specific to old kernel versions, and it's not that
    // hard to fix up here, so that no other code needs to worry about this.
    let mut legacy = MaybeUninit::<__kernel_old_timespec>::uninit();
    ret(syscall!(__NR_clock_gettime, c_int(clockid), &mut legacy))?;

    // The call succeeded, so the kernel has filled in the legacy buffer.
    let legacy = legacy.assume_init();
    *res = Timespec {
        tv_sec: legacy.tv_sec.into(),
        tv_nsec: legacy.tv_nsec.into(),
    };
    Ok(())
}
381
/// 64-bit implementation: issue the `clock_gettime` system call directly,
/// writing into `res`, and convert the raw return into an `io::Result`.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "64")]
unsafe fn _rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    let r0 = syscall!(__NR_clock_gettime, c_int(clockid), res);
    ret(r0)
}
390
/// Fallback with the `GetcpuType` ABI that issues the real `getcpu` system
/// call, forwarding all three pointers unchanged.
///
/// Returns `0` on success, or the negated raw OS error code on failure.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
unsafe extern "C" fn rustix_getcpu_via_syscall(
    cpu: *mut u32,
    node: *mut u32,
    unused: *mut c_void,
) -> c::c_int {
    let outcome = ret(syscall!(__NR_getcpu, cpu, node, unused));
    if let Err(err) = outcome {
        return err.raw_os_error().wrapping_neg();
    }
    0
}
408
// Declaration of the symbol defined by the `global_asm!` block in this file.
// `minimal_init` stores its address in `SYSCALL` as the default entry point.
#[cfg(target_arch = "x86")]
extern "C" {
    /// A symbol pointing to an `int 0x80` instruction. This “function” is only
    /// called from assembly, and only with the x86 syscall calling convention,
    /// so its signature here is not its true signature.
    ///
    /// This extern block and the `global_asm!` below can be replaced with
    /// `#[naked]` if it's stabilized.
    fn rustix_int_0x80();
}
419
// Definition of `rustix_int_0x80`: an `int 0x80` instruction followed by
// `ret`. The symbol is emitted as weak and hidden, in its own text section,
// aligned to 2^4 bytes, with CFI directives bracketing the body.
#[cfg(target_arch = "x86")]
global_asm!(
    r#"
    .section    .text.rustix_int_0x80,"ax",@progbits
    .p2align    4
    .weak       rustix_int_0x80
    .hidden     rustix_int_0x80
    .type       rustix_int_0x80, @function
rustix_int_0x80:
    .cfi_startproc
    int    0x80
    ret
    .cfi_endproc
    .size rustix_int_0x80, .-rustix_int_0x80
"#
);
436
/// Install the syscall-based fallbacks into any function-pointer slot that is
/// still null.
///
/// Each slot is updated with a compare-exchange against null, so a slot that
/// already holds an address is left alone and the failed exchange is ignored.
fn minimal_init() {
    // Store default function addresses in static storage so that if we
    // end up making any system calls while we read the vDSO, they'll work. If
    // the memory happens to already be initialized, this is redundant, but not
    // harmful.
    #[cfg(feature = "time")]
    {
        let fallback = rustix_clock_gettime_via_syscall as *mut Function;
        let _ = CLOCK_GETTIME.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
    }

    #[cfg(feature = "process")]
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "x86",
        target_arch = "riscv64",
        target_arch = "powerpc64"
    ))]
    {
        let fallback = rustix_getcpu_via_syscall as *mut Function;
        let _ = GETCPU.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
    }

    #[cfg(target_arch = "x86")]
    {
        let fallback = rustix_int_0x80 as *mut Function;
        let _ = SYSCALL.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
    }
}
484
485fn init() {
486    minimal_init();
487
488    if let Some(vdso) = vdso::Vdso::new() {
489        #[cfg(feature = "time")]
490        {
491            // Look up the platform-specific `clock_gettime` symbol as
492            // documented [here], except on 32-bit platforms where we look up
493            // the `64`-suffixed variant and fail if we don't find it.
494            //
495            // [here]: https://man7.org/linux/man-pages/man7/vdso.7.html
496            #[cfg(target_arch = "x86_64")]
497            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime"));
498            #[cfg(target_arch = "arm")]
499            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
500            #[cfg(target_arch = "aarch64")]
501            let ptr = vdso.sym(cstr!("LINUX_2.6.39"), cstr!("__kernel_clock_gettime"));
502            #[cfg(target_arch = "x86")]
503            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
504            #[cfg(target_arch = "riscv64")]
505            let ptr = vdso.sym(cstr!("LINUX_4.15"), cstr!("__vdso_clock_gettime"));
506            #[cfg(target_arch = "powerpc64")]
507            let ptr = vdso.sym(cstr!("LINUX_2.6.15"), cstr!("__kernel_clock_gettime"));
508            #[cfg(target_arch = "s390x")]
509            let ptr = vdso.sym(cstr!("LINUX_2.6.29"), cstr!("__kernel_clock_gettime"));
510            #[cfg(any(target_arch = "mips", target_arch = "mips32r6"))]
511            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
512            #[cfg(any(target_arch = "mips64", target_arch = "mips64r6"))]
513            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime"));
514
515            // On all 64-bit platforms, the 64-bit `clock_gettime` symbols are
516            // always available.
517            #[cfg(target_pointer_width = "64")]
518            let ok = true;
519
520            // On some 32-bit platforms, the 64-bit `clock_gettime` symbols are
521            // not available on older kernel versions.
522            #[cfg(any(
523                target_arch = "arm",
524                target_arch = "mips",
525                target_arch = "mips32r6",
526                target_arch = "x86"
527            ))]
528            let ok = !ptr.is_null();
529
530            if ok {
531                assert!(!ptr.is_null());
532
533                // Store the computed function addresses in static storage so
534                // that we don't need to compute them again (but if we do, it
535                // doesn't hurt anything).
536                CLOCK_GETTIME.store(ptr.cast(), Relaxed);
537            }
538        }
539
540        #[cfg(feature = "process")]
541        #[cfg(any(
542            target_arch = "x86_64",
543            target_arch = "x86",
544            target_arch = "riscv64",
545            target_arch = "powerpc64"
546        ))]
547        {
548            // Look up the platform-specific `getcpu` symbol as documented
549            // [here].
550            //
551            // [here]: https://man7.org/linux/man-pages/man7/vdso.7.html
552            #[cfg(target_arch = "x86_64")]
553            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_getcpu"));
554            #[cfg(target_arch = "x86")]
555            let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_getcpu"));
556            #[cfg(target_arch = "riscv64")]
557            let ptr = vdso.sym(cstr!("LINUX_4.15"), cstr!("__vdso_getcpu"));
558            #[cfg(target_arch = "powerpc64")]
559            let ptr = vdso.sym(cstr!("LINUX_2.6.15"), cstr!("__kernel_getcpu"));
560
561            #[cfg(any(
562                target_arch = "x86_64",
563                target_arch = "riscv64",
564                target_arch = "powerpc64"
565            ))]
566            let ok = true;
567
568            // On 32-bit x86, the symbol doesn't appear present sometimes.
569            #[cfg(target_arch = "x86")]
570            let ok = !ptr.is_null();
571
572            #[cfg(any(
573                target_arch = "aarch64",
574                target_arch = "arm",
575                target_arch = "mips",
576                target_arch = "mips32r6",
577                target_arch = "mips64",
578                target_arch = "mips64r6",
579                target_arch = "s390x",
580            ))]
581            let ok = false;
582
583            if ok {
584                assert!(!ptr.is_null());
585
586                // Store the computed function addresses in static storage so
587                // that we don't need to compute them again (but if we do, it
588                // doesn't hurt anything).
589                GETCPU.store(ptr.cast(), Relaxed);
590            }
591        }
592
593        // On x86, also look up the vsyscall entry point.
594        #[cfg(target_arch = "x86")]
595        {
596            let ptr = vdso.sym(cstr!("LINUX_2.5"), cstr!("__kernel_vsyscall"));
597            assert!(!ptr.is_null());
598
599            // As above, store the computed function addresses in
600            // static storage.
601            SYSCALL.store(ptr.cast(), Relaxed);
602        }
603    }
604}