// shadow_shim/clone.rs

use linux_api::ldt::linux_user_desc;
use linux_api::sched::{CloneFlags, CloneResult};
use linux_api::signal::Signal;
use linux_api::ucontext::{sigcontext, ucontext};
use shadow_shim_helper_rs::shim_event::ShimEventAddThreadReq;
use shadow_shmem::allocator::ShMemBlockSerialized;

use crate::tls_allow_native_syscalls;

/// Used below to validate the offset of `field` from `base`.
/// TODO: replace with `core::mem::offset_of!` (stabilized in Rust 1.77).
/// https://github.com/rust-lang/rust/issues/106655
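///
/// For example, `sigcontext_offset_of(ctx, &ctx.r9)` returns the byte offset
/// of `r9` within `sigcontext` (`0x8` on x86-64 Linux, per the assertions in
/// `set_context` below).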
fn sigcontext_offset_of(base: &sigcontext, field: &u64) -> usize {
    let base = core::ptr::from_ref(base) as usize;
    let field = core::ptr::from_ref(field) as usize;
    field - base
}

/// Round `ptr` down to a value that has alignment `align`. Useful when
/// allocating on a stack that grows downward.
///
/// Panics if `align` isn't a power of 2.
///
/// # Safety
///
/// The resulting aligned pointer must be part of the same allocation as `ptr`.
/// e.g. the stack that `ptr` points into must have enough room remaining to do
/// the alignment.
unsafe fn align_down(ptr: *mut u8, align: usize) -> *mut u8 {
    assert!(align.is_power_of_two());
    // Mask off enough low-order bits to ensure proper alignment.
    let ptr = ptr as usize;
    let ptr = ptr & !(align - 1);
    ptr as *mut u8
}
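
// A minimal sanity check of `align_down`'s arithmetic. The pointer values here
// are hypothetical and never dereferenced, so the safety contract above
// (staying within one allocation) doesn't constrain what the test observes.
#[cfg(test)]
mod align_down_tests {
    use super::align_down;

    #[test]
    fn masks_low_order_bits() {
        let p = 0x1013usize as *mut u8;
        // SAFETY: the result is only compared as an address, never dereferenced.
        let aligned = unsafe { align_down(p, 16) };
        assert_eq!(aligned as usize, 0x1010);

        // Already-aligned pointers are returned unchanged.
        let q = 0x2000usize as *mut u8;
        let aligned = unsafe { align_down(q, 16) };
        assert_eq!(aligned as usize, 0x2000);
    }
}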

/// Helper for `do_clone`. Restores the general purpose registers, stack
/// pointer, and instruction pointer from `ctx`, except for `rax`, which is set
/// to 0 (the `clone` return value seen by the child), and `rcx`, which isn't
/// restored (see the notes in the asm block below).
///
/// # Safety
///
/// `ctx` must be safe to restore.
///
/// This is difficult to characterize in a general sense, but e.g. minimally the
/// stack and instruction pointers must be valid, and other register values must
/// correspond to "sound" values of whatever state they correspond to at that
/// instruction pointer.
unsafe extern "C-unwind" fn set_context(ctx: &sigcontext) -> ! {
    // These offsets are hard-coded into the asm format string below.
    // TODO: turn these into const parameters to the asm block when const
    // asm parameters are stabilized.
    // https://github.com/rust-lang/rust/issues/93332
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r8), 0);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r9), 0x8);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r10), 0x10);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r11), 0x18);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r12), 0x20);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r13), 0x28);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r14), 0x30);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r15), 0x38);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsi), 0x48);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdi), 0x40);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbx), 0x58);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdx), 0x60);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbp), 0x50);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rip), 0x80);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsp), 0x78);

    unsafe {
        core::arch::asm!(
            // Restore general purpose registers.
            // Offsets are validated in assertions above.
            "mov r8, [rax+0x0]",
            "mov r9, [rax+0x8]",
            "mov r10, [rax+0x10]",
            "mov r11, [rax+0x18]",
            "mov r12, [rax+0x20]",
            "mov r13, [rax+0x28]",
            "mov r14, [rax+0x30]",
            "mov r15, [rax+0x38]",
            "mov rsi, [rax+0x48]",
            "mov rdi, [rax+0x40]",
            "mov rbx, [rax+0x58]",
            "mov rdx, [rax+0x60]",
            "mov rbp, [rax+0x50]",
            "mov rsp, [rax+0x78]",

            // Push `ctx`'s `rip` to stack
            "mov rax, [rax+0x80]",
            "push rax",

            // Not restored:
            // - `rax`: stores the result of the syscall, which we set below.
            // - `rcx`: architecturally clobbered by the `syscall` instruction
            //   at the interception point, so code resuming at `rip` can't
            //   rely on it anyway.
            // - Floating point and other special registers: hopefully not needed.

            // Set `rax` to 0
            "mov rax, 0",

            // Ret to ctx's `rip`
            "ret",
            in("rax") core::ptr::from_ref(ctx),
            options(noreturn)
        )
    };
}

/// `extern "C-unwind"` wrapper for `crate::tls_ipc::set`, which we can call from
/// assembly.
///
/// # Safety
///
/// `blk` must point to a serialized block of type `IPCData` that outlives the
/// current thread.
unsafe extern "C-unwind" fn tls_ipc_set(blk: *const ShMemBlockSerialized) {
    let blk = unsafe { blk.as_ref().unwrap() };
    let prev = crate::tls_allow_native_syscalls::swap(true);

    // SAFETY: ensured by caller
    unsafe { crate::tls_ipc::set(blk) };

    crate::tls_allow_native_syscalls::swap(prev);
}

/// Execute a native `clone` syscall to create a new thread in a new process.
///
/// This function returns in both the parent and the child.
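/// Following raw `clone` conventions, it returns 0 in the child, the child's
/// pid in the parent, and a negated errno on failure.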
///
/// # Safety
///
/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
///   child thread to restore.
/// * Other pointers, if non-null, must be safely dereferenceable.
/// * `event.child_stack` must be "sufficiently big" for the child thread to
///   run on.
/// * `event.newtls`, if provided, must point to correctly initialized thread
///   local storage.
unsafe fn do_clone_process(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
    let flags = CloneFlags::from_bits(event.flags).unwrap();
    assert!(!flags.contains(CloneFlags::CLONE_THREAD));
    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
    let newtls = event.newtls;

    if flags.contains(CloneFlags::CLONE_VM) {
        // Don't know of a real-world need for this.
        unimplemented!("fork with shared memory");
    }
    if flags.contains(CloneFlags::CLONE_VFORK) {
        // We want to support this eventually, but will take some work.
        unimplemented!("vfork");
    }
    if flags.contains(CloneFlags::CLONE_SETTLS) {
        // In particular we don't correctly handle the case where the parent
        // thread is using `tls::Mode::Native`, but the child thread is
        // unable to.
        //
        // We could try to detect that more specific case and/or correctly
        // handle it, but I don't think this is likely to be needed.
        unimplemented!("CLONE_SETTLS without CLONE_THREAD");
    }

    // The shadow Process should be the parent, not this process.
    assert!(flags.contains(CloneFlags::CLONE_PARENT));

    let parent_tls_key = crate::SHIM_TLS.current_key();

    let res = match unsafe {
        linux_api::sched::clone(
            flags,
            Some(Signal::SIGCHLD),
            // If a child stack is provided, we do the stack switch below as
            // part of initialization instead of having the syscall do it for
            // us. It's a bit simpler this way, and it's safe because we're not
            // using CLONE_VM (not sharing memory with the parent).
            core::ptr::null_mut(),
            ptid,
            ctid,
            newtls as *mut linux_user_desc,
        )
    } {
        Ok(r) => r,
        Err(e) => return e.to_negated_i64(),
    };
    match res {
        CloneResult::CallerIsChild => {
            // SAFETY: We have exclusive access to SHIM_TLS: this is the only thread
            // in the new process, and we're not sharing memory with the parent process.
            unsafe { crate::SHIM_TLS.fork_from(parent_tls_key) };
            // SAFETY: Shadow should give us the correct type and lifetime.
            unsafe { crate::tls_ipc::set(&event.ipc_block) };
            unsafe { crate::bindings::_shim_child_process_init_preload() };
            if !child_stack.is_null() {
                // Do the requested stack switch by long jumping out of the
                // signal handler to an updated context.
                tls_allow_native_syscalls::swap(false);
                let mut mctx = ctx.uc_mcontext;
                mctx.rsp = child_stack as u64;
                unsafe { set_context(&mctx) };
            }
            0
        }
        CloneResult::CallerIsParent(child) => child.as_raw_nonzero().get().into(),
    }
}

/// Execute a native `clone` syscall to create a new thread. The newly created
/// child thread will resume execution from `ctx`, which should be the point
/// where the managed code originally made the `clone` syscall that seccomp
/// intercepted.
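///
/// This function returns only in the parent; in the child, control jumps
/// directly to `ctx` via `set_context`. The parent's return value follows raw
/// `clone` conventions: the new thread's tid on success, or a negated errno.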
///
/// # Safety
///
/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
///   child thread to restore.
/// * Other pointers, if non-null, must be safely dereferenceable.
/// * `event.child_stack` must be "sufficiently big" for the child thread to
///   run on.
/// * `event.newtls`, if provided, must point to correctly initialized thread
///   local storage.
unsafe fn do_clone_thread(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
    let flags = CloneFlags::from_bits(event.flags).unwrap();
    assert!(flags.contains(CloneFlags::CLONE_THREAD));
    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
    let newtls = event.newtls;

    assert!(
        !child_stack.is_null(),
        "clone without a new stack not implemented"
    );

    // x86-64 calling conventions require a 16-byte aligned stack
    assert_eq!(
        child_stack.align_offset(16),
        0,
        "clone with unaligned new stack not implemented"
    );

    // Copy ctx to top of the child stack.
    // SAFETY: Should still point within stack, assuming it fits.
    let child_current_rsp = unsafe { child_stack.sub(core::mem::size_of::<sigcontext>()) };
    let child_current_rsp =
        unsafe { align_down(child_current_rsp, core::mem::align_of::<sigcontext>()) };
    let child_sigcontext = child_current_rsp.cast::<sigcontext>();
    unsafe { core::ptr::write(child_sigcontext, ctx.uc_mcontext) };

    // Update child's copy of context to use the child's stack.
    let child_sigctx = unsafe { child_sigcontext.as_mut().unwrap() };
    child_sigctx.rsp = child_stack as u64;

    // Copy child's IPC block to child's stack
    let child_current_rsp =
        unsafe { child_current_rsp.sub(core::mem::size_of::<ShMemBlockSerialized>()) };
    let child_current_rsp = unsafe {
        align_down(
            child_current_rsp,
            core::mem::align_of::<ShMemBlockSerialized>(),
        )
    };
    let child_ipc_blk = child_current_rsp.cast::<ShMemBlockSerialized>();
    unsafe { core::ptr::write(child_ipc_blk, event.ipc_block) };

    // Ensure stack is 16-aligned so that we can safely make function calls.
    let child_current_rsp = unsafe { align_down(child_current_rsp, 16) };
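
    // At this point the child's stack (growing downward) looks like:
    //   child_stack        - original 16-aligned top of the new stack
    //   child_sigcontext   - copy of ctx.uc_mcontext, with rsp patched
    //   child_ipc_blk      - copy of event.ipc_block
    //   child_current_rsp  - 16-aligned; passed to clone as the child's rsp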

    let rv: i64;
    // SAFETY:
    //
    // This block makes the clone syscall, which is tricky because Rust currently
    // doesn't have a way to tell the compiler that a block or function "returns twice".
    // <https://github.com/rust-lang/libc/issues/1596>
    //
    // We work around this by using a single asm block to:
    // * Make the `clone` syscall
    // * Do the required per-thread shim initialization
    // * Restore CPU state and *jump* to the point where the managed code was
    // originally trying to make the syscall.
    //
    // The point we jump to should already be a point that was expecting to make
    // the clone syscall, so should already correctly handle that both the
    // parent and child thread resume execution there. (The parent thread
    // resumes execution there after returning from the seccomp signal handler
    // normally).
    unsafe {
        core::arch::asm!(
            // Make the clone syscall
            "syscall",
            // If in the parent (or on error), `rax` is nonzero: exit the asm
            // block by jumping forward to the label `2`.
            // https://doc.rust-lang.org/rust-by-example/unsafe/asm.html#labels
            "cmp rax, 0",
            "jne 2f",

            // Initialize the IPC block for this thread
            "mov rdi, {blk}",
            "call {tls_ipc_set}",

            // Initialize state for this thread
            "call {shim_init_thread}",

            // Set CPU state from ctx
            "mov rdi, r12",
            "call {set_context}",

            "2:",
            // clone syscall number in, rv out
            inout("rax") libc::SYS_clone => rv,
            // clone syscall arg1
            in("rdi") flags.bits(),
            // clone syscall arg2
            in("rsi") child_current_rsp,
            // clone syscall arg3
            in("rdx") ptid,
            // clone syscall arg4
            in("r10") ctid,
            // clone syscall arg5
            in("r8") newtls,
            blk = in(reg) child_ipc_blk,
            tls_ipc_set = sym tls_ipc_set,
            shim_init_thread = sym crate::init_thread,
            // callee-saved register
            in("r12") child_sigcontext as * const _,
            set_context = sym set_context,
        )
    }
    rv
}

/// Execute a native `clone` syscall, creating a new thread, which may be in a
/// new process (depending on whether CLONE_THREAD is set).
///
/// If CLONE_THREAD is set, then the newly created child thread will resume
/// execution from `ctx`, which should be the point where the managed code
/// originally made a `clone` syscall (but was intercepted by seccomp).
/// Otherwise this function will return normally in both the parent and child
/// processes.
///
/// # Safety
///
/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
///   child thread to restore.
/// * Other pointers, if non-null, must be safely dereferenceable.
/// * `event.child_stack` must be "sufficiently big" for the child thread to
///   run on.
/// * `event.newtls`, if provided, must point to correctly initialized thread
///   local storage.
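///
/// A sketch of the intended call site (the surrounding handler code here is
/// hypothetical): from the shim's seccomp signal handler, with `ctx` being the
/// interrupted context and `event` the add-thread request received from Shadow:
///
/// ```ignore
/// let rv = unsafe { do_clone(ucontext, &add_thread_req) };
/// // `rv` follows raw-syscall conventions: a negated errno on failure,
/// // otherwise the child's tid/pid in the parent (and 0 in the child, for
/// // the non-CLONE_THREAD path, which returns in both processes).
/// ```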
pub unsafe fn do_clone(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
    let flags = CloneFlags::from_bits(event.flags).unwrap();

    if flags.contains(CloneFlags::CLONE_THREAD) {
        unsafe { do_clone_thread(ctx, event) }
    } else {
        unsafe { do_clone_process(ctx, event) }
    }
}