shadow_shim/
clone.rs

1use linux_api::ldt::linux_user_desc;
2use linux_api::sched::{CloneFlags, CloneResult};
3use linux_api::signal::Signal;
4use linux_api::ucontext::{sigcontext, ucontext};
5use shadow_shim_helper_rs::shim_event::ShimEventAddThreadReq;
6use shadow_shmem::allocator::ShMemBlockSerialized;
7
8use crate::ExecutionContext;
9
10/// Used below to validate the offset of `field` from `base`.
11/// TODO: replace with `core::ptr::offset_of` once stabilized.
12/// https://github.com/rust-lang/rust/issues/106655
13fn sigcontext_offset_of(base: &sigcontext, field: &u64) -> usize {
14    let base = core::ptr::from_ref(base) as usize;
15    let field = core::ptr::from_ref(field) as usize;
16    field - base
17}
18
/// Round `ptr` down to the nearest address that is a multiple of `align`.
/// Useful when allocating on a stack that grows downward.
///
/// The returned pointer is derived from `ptr` (it is stepped backwards rather
/// than being rebuilt from an integer), so it keeps `ptr`'s provenance.
///
/// # Panics
///
/// Panics if `align` isn't a power of 2.
///
/// # Safety
///
/// The resulting aligned pointer must be part of the same allocation as `ptr`.
/// e.g. the stack that `ptr` points into must have enough room remaining to do
/// the alignment.
unsafe fn align_down(ptr: *mut u8, align: usize) -> *mut u8 {
    assert!(align.is_power_of_two());
    // For a power-of-two `align`, the low-order bits selected by `align - 1`
    // are exactly the distance back to the previous aligned address.
    let misalignment = (ptr as usize) & (align - 1);
    // Step the original pointer back instead of casting an integer to a
    // pointer, so the result is still derived from `ptr`.
    ptr.wrapping_sub(misalignment)
}
36
/// Helper for `do_clone_process` and `do_clone_thread`. Loads `r8`-`r15`,
/// `rsi`, `rdi`, `rbx`, `rdx`, `rbp`, and the stack pointer from `ctx`, sets
/// `rax` to 0, and then transfers control to `ctx.rip`. Never returns to the
/// caller.
///
/// Not loaded from `ctx`: `rax` (overwritten with 0), `rcx`, and floating
/// point or other special registers.
///
/// NOTE(review): leaving `rcx` unrestored is presumably fine because the
/// resumed code is returning from a `syscall` instruction, which does not
/// preserve `rcx` — confirm.
///
/// # Safety
///
/// `ctx` must be safe to restore.
///
/// This is difficult to characterize in a general sense, but e.g. minimally the
/// stack and instruction pointers must be valid, and other register values must
/// correspond to "sound" values of whatever state they correspond to at that
/// instruction pointer.
///
/// The 8 bytes immediately below `ctx.rsp` must be writable: the target
/// instruction pointer is temporarily pushed there before the final `ret`.
unsafe extern "C-unwind" fn set_context(ctx: &sigcontext) -> ! {
    // These offsets are hard-coded into the asm format string below.
    // The checks only run in builds with debug assertions enabled.
    // TODO: turn these into const parameters to the asm block when const
    // asm parameters are stabilized.
    // https://github.com/rust-lang/rust/issues/93332
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r8), 0);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r9), 0x8);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r10), 0x10);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r11), 0x18);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r12), 0x20);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r13), 0x28);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r14), 0x30);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r15), 0x38);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsi), 0x48);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdi), 0x40);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbx), 0x58);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdx), 0x60);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbp), 0x50);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rip), 0x80);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsp), 0x78);

    unsafe {
        core::arch::asm!(
            // On entry `rax` holds the address of `ctx` (see the `in("rax")`
            // operand below); every load is relative to it.
            //
            // Restore general purpose registers.
            // Offsets are validated in assertions above.
            "mov r8, [rax+0x0]",
            "mov r9, [rax+0x8]",
            "mov r10, [rax+0x10]",
            "mov r11, [rax+0x18]",
            "mov r12, [rax+0x20]",
            "mov r13, [rax+0x28]",
            "mov r14, [rax+0x30]",
            "mov r15, [rax+0x38]",
            "mov rsi, [rax+0x48]",
            "mov rdi, [rax+0x40]",
            "mov rbx, [rax+0x58]",
            "mov rdx, [rax+0x60]",
            "mov rbp, [rax+0x50]",
            // Switch to the target stack. From here on the current function's
            // own stack frame is no longer in use.
            "mov rsp, [rax+0x78]",

            // Push `ctx`'s `rip` to stack
            // (this is the last read through `rax` as the `ctx` pointer; the
            // push writes just below the stack pointer restored above).
            "mov rax, [rax+0x80]",
            "push rax",

            // Not restored:
            // - `rax`: stores the result of the syscall, which we set below.
            // - `rcx`: never loaded from `ctx`.
            // - Floating point and other special registers: hopefully not needed.

            // Set `rax` to 0
            "mov rax, 0",

            // Ret to ctx's `rip`
            // (pops the value pushed above, leaving `rsp` equal to `ctx.rsp`).
            "ret",
            in("rax") core::ptr::from_ref(ctx),
            options(noreturn)
        )
    };
}
106
107/// `extern "C-unwind"` wrapper for `crate::tls_ipc::set`, which we can call from
108/// assembly.
109///
110/// # Safety
111///
112/// `blk` must contained a serialized block of
113/// type `IPCData`, which outlives the current thread.
114unsafe extern "C-unwind" fn tls_ipc_set(blk: *const ShMemBlockSerialized) {
115    debug_assert_eq!(ExecutionContext::current(), ExecutionContext::Shadow);
116
117    let blk = unsafe { blk.as_ref().unwrap() };
118
119    // SAFETY: ensured by caller
120    unsafe { crate::tls_ipc::set(blk) };
121}
122
123/// Execute a native `clone` syscall to create a new thread in a new process.
124///
125/// This function returns in both the parent and the child.
126///
127/// # Safety
128///
129/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
130///   child thread to restore.
131/// * Other pointers, if non-null, must be safely dereferenceable.
132/// * `child_stack` must be "sufficiently big" for the child thread to run on.
133/// * `tls` if provided must point to correctly initialized thread local storage.
134unsafe fn do_clone_process(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
135    let flags = CloneFlags::from_bits(event.flags).unwrap();
136    assert!(!flags.contains(CloneFlags::CLONE_THREAD));
137    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
138    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
139    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
140    let newtls = event.newtls;
141
142    if flags.contains(CloneFlags::CLONE_VM) {
143        // Don't know of a real-world need for this.
144        unimplemented!("fork with shared memory");
145    }
146    if flags.contains(CloneFlags::CLONE_VFORK) {
147        // We want to support this eventually, but will take some work.
148        unimplemented!("vfork");
149    }
150    if flags.contains(CloneFlags::CLONE_SETTLS) {
151        // In particular we don't correctly handle the case where the parent
152        // thread is using `tls::Mode::Native`, but the child thread is
153        // unable to.
154        //
155        // We could try to detect that more specific case and/or correctly
156        // handle it, but I don't think this is likely to be needed.
157        unimplemented!("CLONE_SETTLS without CLONE_THREAD");
158    }
159
160    // The shadow Process should be the parent; not this process.
161    assert!(flags.contains(CloneFlags::CLONE_PARENT));
162
163    let parent_tls_key = crate::SHIM_TLS.current_key();
164
165    let res = match unsafe {
166        linux_api::sched::clone(
167            flags,
168            Some(Signal::SIGCHLD),
169            // If a child stack is provided, we do the stack switch below
170            // as part of initialization instead of having the syscall do it for us.
171            // It's a bit simpler this way, and we can safely do it this way
172            // since we're not using CLONE_VM (not sharing memory with the
173            // parent).
174            core::ptr::null_mut(),
175            ptid,
176            ctid,
177            newtls as *mut linux_user_desc,
178        )
179    } {
180        Ok(r) => r,
181        Err(e) => return e.to_negated_i64(),
182    };
183    match res {
184        CloneResult::CallerIsChild => {
185            // SAFETY: We have exclusive access to SHIM_TLS: this is the only thread
186            // in the new process, and we're not sharing memory with the parent process.
187            unsafe { crate::SHIM_TLS.fork_from(parent_tls_key) };
188            // SAFETY: Shadow should give us the correct type and lifetime.
189            unsafe { crate::tls_ipc::set(&event.ipc_block) };
190            unsafe { crate::bindings::_shim_child_process_init_preload() };
191            if !child_stack.is_null() {
192                // Do the requested stack switch by long jumping out of the
193                // signal handler to an updated context.
194                ExecutionContext::Application.enter_without_restorer();
195                let mut mctx = ctx.uc_mcontext;
196                mctx.rsp = child_stack as u64;
197                unsafe { set_context(&mctx) };
198            }
199            0
200        }
201        CloneResult::CallerIsParent(child) => child.as_raw_nonzero().get().into(),
202    }
203}
204
205/// Execute a native `clone` syscall to create a new thread.  The newly created
206/// child thread will resume execution from `ctx`, which should be the point
207/// where the managed code originally made a `clone` syscall (but was
208/// intercepted by seccomp).
209///
210/// # Safety
211///
212/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
213///   child thread to restore.
214/// * Other pointers, if non-null, must be safely dereferenceable.
215/// * `child_stack` must be "sufficiently big" for the child thread to run on.
216/// * `tls` if provided must point to correctly initialized thread local storage.
217unsafe fn do_clone_thread(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
218    let flags = CloneFlags::from_bits(event.flags).unwrap();
219    assert!(flags.contains(CloneFlags::CLONE_THREAD));
220    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
221    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
222    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
223    let newtls = event.newtls;
224
225    assert!(
226        !child_stack.is_null(),
227        "clone without a new stack not implemented"
228    );
229
230    // x86-64 calling conventions require a 16-byte aligned stack
231    assert_eq!(
232        child_stack.align_offset(16),
233        0,
234        "clone with unaligned new stack not implemented"
235    );
236
237    // Copy ctx to top of the child stack.
238    // SAFETY: Should still point within stack, assuming it fits.
239    let child_current_rsp = unsafe { child_stack.sub(core::mem::size_of::<sigcontext>()) };
240    let child_current_rsp =
241        unsafe { align_down(child_current_rsp, core::mem::align_of::<sigcontext>()) };
242    let child_sigcontext = child_current_rsp.cast::<sigcontext>();
243    unsafe { core::ptr::write(child_sigcontext, ctx.uc_mcontext) };
244
245    // Update child's copy of context to use the child's stack.
246    let child_sigctx = unsafe { child_sigcontext.as_mut().unwrap() };
247    child_sigctx.rsp = child_stack as u64;
248
249    // Copy child's IPC block to child's stack
250    let child_current_rsp =
251        unsafe { child_current_rsp.sub(core::mem::size_of::<ShMemBlockSerialized>()) };
252    let child_current_rsp = unsafe {
253        align_down(
254            child_current_rsp,
255            core::mem::align_of::<ShMemBlockSerialized>(),
256        )
257    };
258    let child_ipc_blk = child_current_rsp.cast::<ShMemBlockSerialized>();
259    unsafe { core::ptr::write(child_ipc_blk, event.ipc_block) };
260
261    // Ensure stack is 16-aligned so that we can safely make function calls.
262    let child_current_rsp = unsafe { align_down(child_current_rsp, 16) };
263
264    let rv: i64;
265    // SAFETY:
266    //
267    // This block makes the clone syscall, which is tricky because Rust currently
268    // doesn't have a way to tell the compiler that a block or function "returns twice".
269    // <https://github.com/rust-lang/libc/issues/1596>
270    //
271    // We work around this by using a single asm block to:
272    // * Make the `clone` syscall
273    // * Do the required per-thread shim initialization
274    // * Restore CPU state and *jump* to the point where the managed code was
275    // originally trying to make the syscall.
276    //
277    // The point we jump to should already be a point that was expecting to make
278    // the clone syscall, so should already correctly handle that both the
279    // parent and child thread resume execution there. (The parent thread
280    // resumes execution there after returning from the seccomp signal handler
281    // normally).
282    unsafe {
283        core::arch::asm!(
284            // Make the clone syscall
285            "syscall",
286            // If in the parent, exit the asm block (by jumping forward to the label
287            // `2`). https://doc.rust-lang.org/rust-by-example/unsafe/asm.html#labels
288            "cmp rax, 0",
289            "jne 2f",
290
291            // Set the current context to shadow
292            "mov rdi, {exe_ctx_shadow}",
293            "call {shim_swapExecutionContext}",
294
295            // Initialize the IPC block for this thread
296            "mov rdi, {blk}",
297            "call {tls_ipc_set}",
298
299            // Initialize state for this thread
300            "call {shim_init_thread}",
301
302            // Set the current context to application
303            "mov rdi, {exe_ctx_application}",
304            "call {shim_swapExecutionContext}",
305
306            // Set CPU state from ctx
307            "mov rdi, r12",
308            "call {set_context}",
309
310            "2:",
311            // clone syscall number in, rv out
312            inout("rax") libc::SYS_clone => rv,
313            // clone syscall arg1
314            in("rdi") flags.bits(),
315            // clone syscall arg2
316            in("rsi") child_current_rsp,
317            // clone syscall arg3
318            in("rdx") ptid,
319            // clone syscall arg4
320            in("r10") ctid,
321            // clone syscall arg5
322            in("r8") newtls,
323            blk = in(reg) child_ipc_blk,
324            exe_ctx_shadow = const crate::EXECUTION_CONTEXT_SHADOW_CONST,
325            exe_ctx_application = const crate::EXECUTION_CONTEXT_APPLICATION_CONST,
326            shim_swapExecutionContext = sym crate::export::shim_swapExecutionContext,
327            tls_ipc_set = sym tls_ipc_set,
328            shim_init_thread = sym crate::init_thread,
329            // callee-saved register
330            in("r12") child_sigcontext as * const _,
331            set_context = sym set_context,
332        )
333    }
334    rv
335}
336
337/// Execute a native `clone` syscall, creating a new thread, which may be in
338/// a new process (depending whether CLONE_THREAD is set).
339///
340/// If CLONE_THREAD is set, then the newly created child thread will resume
341/// execution from `ctx`, which should be the point where the managed code
342/// originally made a `clone` syscall (but was intercepted by seccomp).
343/// Otherwise this function will return normally in both the parent and child
344/// processes.
345///
346/// # Safety
347///
348/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
349///   child thread to restore.
350/// * Other pointers, if non-null, must be safely dereferenceable.
351/// * `child_stack` must be "sufficiently big" for the child thread to run on.
352/// * `tls` if provided must point to correctly initialized thread local storage.
353pub unsafe fn do_clone(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
354    let flags = CloneFlags::from_bits(event.flags).unwrap();
355
356    if flags.contains(CloneFlags::CLONE_THREAD) {
357        unsafe { do_clone_thread(ctx, event) }
358    } else {
359        unsafe { do_clone_process(ctx, event) }
360    }
361}