// shadow_shim/clone.rs
1use linux_api::ldt::linux_user_desc;
2use linux_api::sched::{CloneFlags, CloneResult};
3use linux_api::signal::Signal;
4use linux_api::ucontext::{sigcontext, ucontext};
5use shadow_shim_helper_rs::shim_event::ShimEventAddThreadReq;
6use shadow_shmem::allocator::ShMemBlockSerialized;
7
8use crate::ExecutionContext;
9
10/// Used below to validate the offset of `field` from `base`.
11/// TODO: replace with `core::ptr::offset_of` once stabilized.
12/// https://github.com/rust-lang/rust/issues/106655
13fn sigcontext_offset_of(base: &sigcontext, field: &u64) -> usize {
14 let base = core::ptr::from_ref(base) as usize;
15 let field = core::ptr::from_ref(field) as usize;
16 field - base
17}
18
/// Round `ptr` down to a value that has alignment `align`. Useful when
/// allocating on a stack that grows downward.
///
/// Panics if `align` isn't a power of 2.
///
/// # Safety
///
/// The resulting aligned pointer must be part of the same allocation as `ptr`,
/// i.e. the stack that `ptr` points into must have enough room remaining to
/// absorb the alignment adjustment.
unsafe fn align_down(ptr: *mut u8, align: usize) -> *mut u8 {
    assert!(align.is_power_of_two());
    let addr = ptr as usize;
    // Drop the remainder modulo `align`; for a power of two this is equivalent
    // to masking off the low-order bits.
    let aligned = addr - (addr % align);
    aligned as *mut u8
}
36
/// Helper for `do_clone`. Restores all general purpose registers, stack pointer,
/// and instruction pointer from `ctx` except for `rax`, which is set to 0.
/// Never returns to its caller; execution continues at `ctx.rip`.
///
/// # Safety
///
/// `ctx` must be safe to restore.
///
/// This is difficult to characterize in a general sense, but e.g. minimally the
/// stack and instruction pointers must be valid, and other register values must
/// correspond to "sound" values of whatever state they correspond to at that
/// instruction pointer.
unsafe extern "C-unwind" fn set_context(ctx: &sigcontext) -> ! {
    // These offsets are hard-coded into the asm format string below.
    // They are validated here (debug builds only) against the real field
    // offsets of `sigcontext`, since a mismatch would silently restore the
    // wrong registers.
    // TODO: turn these into const parameters to the asm block when const
    // asm parameters are stabilized.
    // https://github.com/rust-lang/rust/issues/93332
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r8), 0);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r9), 0x8);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r10), 0x10);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r11), 0x18);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r12), 0x20);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r13), 0x28);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r14), 0x30);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.r15), 0x38);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsi), 0x48);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdi), 0x40);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbx), 0x58);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rdx), 0x60);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rbp), 0x50);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rip), 0x80);
    debug_assert_eq!(sigcontext_offset_of(ctx, &ctx.rsp), 0x78);

    unsafe {
        core::arch::asm!(
            // `rax` holds the address of `ctx` (see the `in("rax")` operand
            // below); each load reads one saved register out of it.
            // Restore general purpose registers.
            // Offsets are validated in assertions above.
            "mov r8, [rax+0x0]",
            "mov r9, [rax+0x8]",
            "mov r10, [rax+0x10]",
            "mov r11, [rax+0x18]",
            "mov r12, [rax+0x20]",
            "mov r13, [rax+0x28]",
            "mov r14, [rax+0x30]",
            "mov r15, [rax+0x38]",
            "mov rsi, [rax+0x48]",
            "mov rdi, [rax+0x40]",
            "mov rbx, [rax+0x58]",
            "mov rdx, [rax+0x60]",
            "mov rbp, [rax+0x50]",
            // Stack switch: everything pushed after this lands on `ctx`'s stack.
            "mov rsp, [rax+0x78]",

            // Push `ctx`'s `rip` to stack
            "mov rax, [rax+0x80]",
            "push rax",

            // Not restored:
            // - `rax`: stores the result of the syscall, which we set below.
            // - Floating point and other special registers: hopefully not needed.

            // Set `rax` to 0
            "mov rax, 0",

            // Ret to ctx's `rip` (pops the address pushed above).
            "ret",
            in("rax") core::ptr::from_ref(ctx),
            options(noreturn)
        )
    };
}
106
107/// `extern "C-unwind"` wrapper for `crate::tls_ipc::set`, which we can call from
108/// assembly.
109///
110/// # Safety
111///
112/// `blk` must contained a serialized block of
113/// type `IPCData`, which outlives the current thread.
114unsafe extern "C-unwind" fn tls_ipc_set(blk: *const ShMemBlockSerialized) {
115 debug_assert_eq!(ExecutionContext::current(), ExecutionContext::Shadow);
116
117 let blk = unsafe { blk.as_ref().unwrap() };
118
119 // SAFETY: ensured by caller
120 unsafe { crate::tls_ipc::set(blk) };
121}
122
/// Execute a native `clone` syscall to create a new thread in a new process.
///
/// This function returns in both the parent and the child.
///
/// # Safety
///
/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
///   child thread to restore.
/// * Other pointers, if non-null, must be safely dereferenceable.
/// * `event.child_stack` must be "sufficiently big" for the child thread to
///   run on.
/// * `event.newtls` if provided must point to correctly initialized thread
///   local storage.
unsafe fn do_clone_process(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
    let flags = CloneFlags::from_bits(event.flags).unwrap();
    // This helper only handles process creation; thread creation
    // (CLONE_THREAD) is handled by `do_clone_thread`.
    assert!(!flags.contains(CloneFlags::CLONE_THREAD));
    // Unpack the raw clone arguments forwarded in `event`.
    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
    let newtls = event.newtls;

    if flags.contains(CloneFlags::CLONE_VM) {
        // Don't know of a real-world need for this.
        unimplemented!("fork with shared memory");
    }
    if flags.contains(CloneFlags::CLONE_VFORK) {
        // We want to support this eventually, but will take some work.
        unimplemented!("vfork");
    }
    if flags.contains(CloneFlags::CLONE_SETTLS) {
        // In particular we don't correctly handle the case where the parent
        // thread is using `tls::Mode::Native`, but the child thread is
        // unable to.
        //
        // We could try to detect that more specific case and/or correctly
        // handle it, but I don't think this is likely to be needed.
        unimplemented!("CLONE_SETTLS without CLONE_THREAD");
    }

    // The shadow Process should be the parent; not this process.
    assert!(flags.contains(CloneFlags::CLONE_PARENT));

    // Remember which TLS key the parent is using, so that the child can fork
    // its TLS contents from it after the clone (below).
    let parent_tls_key = crate::SHIM_TLS.current_key();

    let res = match unsafe {
        linux_api::sched::clone(
            flags,
            // Child-exit signal for the new process.
            Some(Signal::SIGCHLD),
            // If a child stack is provided, we do the stack switch below
            // as part of initialization instead of having the syscall do it for us.
            // It's a bit simpler this way, and we can safely do it this way
            // since we're not using CLONE_VM (not sharing memory with the
            // parent).
            core::ptr::null_mut(),
            ptid,
            ctid,
            newtls as *mut linux_user_desc,
        )
    } {
        Ok(r) => r,
        // Mirror the kernel convention: failure is returned as a negated errno.
        Err(e) => return e.to_negated_i64(),
    };
    match res {
        CloneResult::CallerIsChild => {
            // SAFETY: We have exclusive access to SHIM_TLS: this is the only thread
            // in the new process, and we're not sharing memory with the parent process.
            unsafe { crate::SHIM_TLS.fork_from(parent_tls_key) };
            // SAFETY: Shadow should give us the correct type and lifetime.
            unsafe { crate::tls_ipc::set(&event.ipc_block) };
            // Per-process shim re-initialization for the new child process.
            unsafe { crate::bindings::_shim_child_process_init_preload() };
            if !child_stack.is_null() {
                // Do the requested stack switch by long jumping out of the
                // signal handler to an updated context. `set_context` never
                // returns; execution resumes at the copied context's `rip`,
                // now running on `child_stack`.
                ExecutionContext::Application.enter_without_restorer();
                let mut mctx = ctx.uc_mcontext;
                mctx.rsp = child_stack as u64;
                unsafe { set_context(&mctx) };
            }
            // No stack switch requested: return 0 in the child, as `clone` does.
            0
        }
        // In the parent: return the new child's pid.
        CloneResult::CallerIsParent(child) => child.as_raw_nonzero().get().into(),
    }
}
204
/// Execute a native `clone` syscall to create a new thread. The newly created
/// child thread will resume execution from `ctx`, which should be the point
/// where the managed code originally made a `clone` syscall (but was
/// intercepted by seccomp).
///
/// # Safety
///
/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
///   child thread to restore.
/// * Other pointers, if non-null, must be safely dereferenceable.
/// * `event.child_stack` must be "sufficiently big" for the child thread to
///   run on.
/// * `event.newtls` if provided must point to correctly initialized thread
///   local storage.
unsafe fn do_clone_thread(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
    let flags = CloneFlags::from_bits(event.flags).unwrap();
    // This helper only handles thread creation; process creation is handled
    // by `do_clone_process`.
    assert!(flags.contains(CloneFlags::CLONE_THREAD));
    // Unpack the raw clone arguments forwarded in `event`.
    let ptid: *mut i32 = event.ptid.cast::<i32>().into_raw_mut();
    let ctid: *mut i32 = event.ctid.cast::<i32>().into_raw_mut();
    let child_stack: *mut u8 = event.child_stack.cast::<u8>().into_raw_mut();
    let newtls = event.newtls;

    assert!(
        !child_stack.is_null(),
        "clone without a new stack not implemented"
    );

    // x86-64 calling conventions require a 16-byte aligned stack
    assert_eq!(
        child_stack.align_offset(16),
        0,
        "clone with unaligned new stack not implemented"
    );

    // Copy ctx to top of the child stack.
    // SAFETY: Should still point within stack, assuming it fits.
    let child_current_rsp = unsafe { child_stack.sub(core::mem::size_of::<sigcontext>()) };
    let child_current_rsp =
        unsafe { align_down(child_current_rsp, core::mem::align_of::<sigcontext>()) };
    let child_sigcontext = child_current_rsp.cast::<sigcontext>();
    unsafe { core::ptr::write(child_sigcontext, ctx.uc_mcontext) };

    // Update child's copy of context to use the child's stack. When the child
    // eventually restores this context, it resumes with `rsp` at the (empty)
    // top of its new stack.
    let child_sigctx = unsafe { child_sigcontext.as_mut().unwrap() };
    child_sigctx.rsp = child_stack as u64;

    // Copy child's IPC block to child's stack, just below the copied context.
    let child_current_rsp =
        unsafe { child_current_rsp.sub(core::mem::size_of::<ShMemBlockSerialized>()) };
    let child_current_rsp = unsafe {
        align_down(
            child_current_rsp,
            core::mem::align_of::<ShMemBlockSerialized>(),
        )
    };
    let child_ipc_blk = child_current_rsp.cast::<ShMemBlockSerialized>();
    unsafe { core::ptr::write(child_ipc_blk, event.ipc_block) };

    // Ensure stack is 16-aligned so that we can safely make function calls.
    let child_current_rsp = unsafe { align_down(child_current_rsp, 16) };

    let rv: i64;
    // SAFETY:
    //
    // This block makes the clone syscall, which is tricky because Rust currently
    // doesn't have a way to tell the compiler that a block or function "returns twice".
    // <https://github.com/rust-lang/libc/issues/1596>
    //
    // We work around this by using a single asm block to:
    // * Make the `clone` syscall
    // * Do the required per-thread shim initialization
    // * Restore CPU state and *jump* to the point where the managed code was
    //   originally trying to make the syscall.
    //
    // The point we jump to should already be a point that was expecting to make
    // the clone syscall, so should already correctly handle that both the
    // parent and child thread resume execution there. (The parent thread
    // resumes execution there after returning from the seccomp signal handler
    // normally).
    unsafe {
        core::arch::asm!(
            // Make the clone syscall
            "syscall",
            // If in the parent, exit the asm block (by jumping forward to the label
            // `2`). https://doc.rust-lang.org/rust-by-example/unsafe/asm.html#labels
            "cmp rax, 0",
            "jne 2f",

            // Only the child reaches this point, now running on the new stack
            // (`child_current_rsp`, passed to clone via `rsi` below).

            // Set the current context to shadow
            "mov rdi, {exe_ctx_shadow}",
            "call {shim_swapExecutionContext}",

            // Initialize the IPC block for this thread
            "mov rdi, {blk}",
            "call {tls_ipc_set}",

            // Initialize state for this thread
            "call {shim_init_thread}",

            // Set the current context to application
            "mov rdi, {exe_ctx_application}",
            "call {shim_swapExecutionContext}",

            // Set CPU state from ctx. `r12` is callee-saved, so it still holds
            // the pointer to the child's copied sigcontext after the calls
            // above. `set_context` never returns.
            "mov rdi, r12",
            "call {set_context}",

            "2:",
            // clone syscall number in, rv out
            inout("rax") libc::SYS_clone => rv,
            // clone syscall arg1
            in("rdi") flags.bits(),
            // clone syscall arg2
            in("rsi") child_current_rsp,
            // clone syscall arg3
            in("rdx") ptid,
            // clone syscall arg4
            in("r10") ctid,
            // clone syscall arg5
            in("r8") newtls,
            blk = in(reg) child_ipc_blk,
            exe_ctx_shadow = const crate::EXECUTION_CONTEXT_SHADOW_CONST,
            exe_ctx_application = const crate::EXECUTION_CONTEXT_APPLICATION_CONST,
            shim_swapExecutionContext = sym crate::export::shim_swapExecutionContext,
            tls_ipc_set = sym tls_ipc_set,
            shim_init_thread = sym crate::init_thread,
            // callee-saved register
            in("r12") child_sigcontext as * const _,
            set_context = sym set_context,
        )
    }
    // Only the parent reaches here; `rv` is the clone syscall's return value
    // (the child's tid, or a negated errno on failure).
    rv
}
336
337/// Execute a native `clone` syscall, creating a new thread, which may be in
338/// a new process (depending whether CLONE_THREAD is set).
339///
340/// If CLONE_THREAD is set, then the newly created child thread will resume
341/// execution from `ctx`, which should be the point where the managed code
342/// originally made a `clone` syscall (but was intercepted by seccomp).
343/// Otherwise this function will return normally in both the parent and child
344/// processes.
345///
346/// # Safety
347///
348/// * `ctx` must be dereferenceable, and must be safe for the newly spawned
349/// child thread to restore.
350/// * Other pointers, if non-null, must be safely dereferenceable.
351/// * `child_stack` must be "sufficiently big" for the child thread to run on.
352/// * `tls` if provided must point to correctly initialized thread local storage.
353pub unsafe fn do_clone(ctx: &ucontext, event: &ShimEventAddThreadReq) -> i64 {
354 let flags = CloneFlags::from_bits(event.flags).unwrap();
355
356 if flags.contains(CloneFlags::CLONE_THREAD) {
357 unsafe { do_clone_thread(ctx, event) }
358 } else {
359 unsafe { do_clone_process(ctx, event) }
360 }
361}