2
0

// clone.rs (11 KB)
  1. use core::arch::global_asm;
  2. use core::mem::size_of;
  3. use alloc::boxed::Box;
  4. use alloc::vec::Vec;
  5. use syscall::data::Map;
  6. use syscall::flag::{MapFlags, O_CLOEXEC};
  7. use syscall::error::{Error, Result, EINVAL, ENAMETOOLONG};
  8. use syscall::SIGCONT;
  9. use super::extra::{create_set_addr_space_buf, FdGuard};
  10. fn new_context() -> Result<(FdGuard, usize)> {
  11. // Create a new context (fields such as uid/gid will be inherited from the current context).
  12. let fd = FdGuard::new(syscall::open("thisproc:new/open_via_dup", O_CLOEXEC)?);
  13. // Extract pid.
  14. let mut buffer = [0_u8; 64];
  15. let len = syscall::fpath(*fd, &mut buffer)?;
  16. let buffer = buffer.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
  17. let colon_idx = buffer.iter().position(|c| *c == b':').ok_or(Error::new(EINVAL))?;
  18. let slash_idx = buffer.iter().skip(colon_idx).position(|c| *c == b'/').ok_or(Error::new(EINVAL))? + colon_idx;
  19. let pid_bytes = buffer.get(colon_idx + 1..slash_idx).ok_or(Error::new(EINVAL))?;
  20. let pid_str = core::str::from_utf8(pid_bytes).map_err(|_| Error::new(EINVAL))?;
  21. let pid = pid_str.parse::<usize>().map_err(|_| Error::new(EINVAL))?;
  22. Ok((fd, pid))
  23. }
  24. fn copy_str(cur_pid_fd: usize, new_pid_fd: usize, key: &str) -> Result<()> {
  25. let cur_name_fd = FdGuard::new(syscall::dup(cur_pid_fd, key.as_bytes())?);
  26. let new_name_fd = FdGuard::new(syscall::dup(new_pid_fd, key.as_bytes())?);
  27. let mut buf = [0_u8; 256];
  28. let len = syscall::read(*cur_name_fd, &mut buf)?;
  29. let buf = buf.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
  30. syscall::write(*new_name_fd, &buf)?;
  31. Ok(())
  32. }
  33. #[cfg(target_arch = "x86_64")]
  34. fn copy_env_regs(cur_pid_fd: usize, new_pid_fd: usize) -> Result<()> {
  35. // Copy environment registers.
  36. {
  37. let cur_env_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/env")?);
  38. let new_env_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/env")?);
  39. let mut env_regs = syscall::EnvRegisters::default();
  40. let _ = syscall::read(*cur_env_regs_fd, &mut env_regs)?;
  41. let _ = syscall::write(*new_env_regs_fd, &env_regs)?;
  42. }
  43. Ok(())
  44. }
  45. /// Spawns a new context sharing the same address space as the current one (i.e. a new thread).
  46. pub unsafe fn pte_clone_impl(stack: *mut usize) -> Result<usize> {
  47. let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
  48. let (new_pid_fd, new_pid) = new_context()?;
  49. // Allocate a new signal stack.
  50. {
  51. let sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
  52. const SIGSTACK_SIZE: usize = 1024 * 256;
  53. // TODO: Put sigstack at high addresses?
  54. let target_sigstack = syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE })? + SIGSTACK_SIZE;
  55. let _ = syscall::write(*sigstack_fd, &usize::to_ne_bytes(target_sigstack))?;
  56. }
  57. copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
  58. copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
  59. // Reuse existing address space
  60. {
  61. let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
  62. let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
  63. let buf = create_set_addr_space_buf(*cur_addr_space_fd, pte_clone_ret as usize, stack as usize);
  64. let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
  65. }
  66. // Reuse file table
  67. {
  68. let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
  69. let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
  70. let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*cur_filetable_fd))?;
  71. }
  72. // Reuse sigactions (on Linux, CLONE_THREAD requires CLONE_SIGHAND which implies the sigactions
  73. // table is reused).
  74. {
  75. let cur_sigaction_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigactions")?);
  76. let new_sigaction_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-sigactions")?);
  77. let _ = syscall::write(*new_sigaction_sel_fd, &usize::to_ne_bytes(*cur_sigaction_fd))?;
  78. }
  79. copy_env_regs(*cur_pid_fd, *new_pid_fd)?;
  80. // Unblock context.
  81. syscall::kill(new_pid, SIGCONT)?;
  82. Ok(0)
  83. }
  84. /// Spawns a new context which will not share the same address space as the current one. File
  85. /// descriptors from other schemes are reobtained with `dup`, and grants referencing such file
  86. /// descriptors are reobtained through `fmap`. Other mappings are kept but duplicated using CoW.
  87. pub fn fork_impl() -> Result<usize> {
  88. unsafe {
  89. Error::demux(fork_wrapper())
  90. }
  91. }
  92. fn fork_inner(initial_rsp: *mut usize) -> Result<usize> {
  93. let (cur_filetable_fd, new_pid_fd, new_pid);
  94. {
  95. let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
  96. (new_pid_fd, new_pid) = new_context()?;
  97. // Do not allocate new signal stack, but copy existing address (all memory will be re-mapped
  98. // CoW later).
  99. {
  100. let cur_sigstack_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigstack")?);
  101. let new_sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
  102. let mut sigstack_buf = usize::to_ne_bytes(0);
  103. let _ = syscall::read(*cur_sigstack_fd, &mut sigstack_buf);
  104. let _ = syscall::write(*new_sigstack_fd, &sigstack_buf);
  105. }
  106. copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
  107. copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
  108. {
  109. let cur_sigaction_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigactions")?);
  110. let new_sigaction_fd = FdGuard::new(syscall::dup(*cur_sigaction_fd, b"copy")?);
  111. let new_sigaction_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-sigactions")?);
  112. let _ = syscall::write(*new_sigaction_sel_fd, &usize::to_ne_bytes(*new_sigaction_fd))?;
  113. }
  114. // Copy existing files into new file table, but do not reuse the same file table (i.e. new
  115. // parent FDs will not show up for the child).
  116. {
  117. cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
  118. // This must be done before the address space is copied.
  119. unsafe {
  120. initial_rsp.write(*cur_filetable_fd);
  121. initial_rsp.add(1).write(*new_pid_fd);
  122. }
  123. }
  124. // CoW-duplicate address space.
  125. {
  126. let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
  127. // FIXME: Find mappings which use external file descriptors
  128. let new_addr_space_fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, b"exclusive")?);
  129. let mut buf = vec! [0_u8; 4096];
  130. let mut bytes_read = 0;
  131. loop {
  132. let new_bytes_read = syscall::read(*cur_addr_space_fd, &mut buf[bytes_read..])?;
  133. if new_bytes_read == 0 { break }
  134. bytes_read += new_bytes_read;
  135. }
  136. let bytes = &buf[..bytes_read];
  137. for struct_bytes in bytes.array_chunks::<{size_of::<usize>() * 4}>() {
  138. let mut words = struct_bytes.array_chunks::<{size_of::<usize>()}>().copied().map(usize::from_ne_bytes);
  139. let addr = words.next().unwrap();
  140. let size = words.next().unwrap();
  141. let flags = words.next().unwrap();
  142. let offset = words.next().unwrap();
  143. if flags & 0x8000_0000 == 0 {
  144. continue;
  145. }
  146. let map_flags = MapFlags::from_bits_truncate(flags);
  147. let mapped_address = unsafe {
  148. let fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, format!("grant-{:x}", addr).as_bytes())?);
  149. syscall::fmap(*fd, &syscall::Map { address: 0, size, flags: map_flags, offset })?
  150. };
  151. let mut buf = [0_u8; size_of::<usize>() * 4];
  152. let mut chunks = buf.array_chunks_mut::<{size_of::<usize>()}>();
  153. *chunks.next().unwrap() = usize::to_ne_bytes(addr);
  154. *chunks.next().unwrap() = usize::to_ne_bytes(size);
  155. *chunks.next().unwrap() = usize::to_ne_bytes(map_flags.bits());
  156. *chunks.next().unwrap() = usize::to_ne_bytes(mapped_address);
  157. let _ = syscall::write(*new_addr_space_fd, &buf)?;
  158. }
  159. let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
  160. let buf = create_set_addr_space_buf(*new_addr_space_fd, fork_ret as usize, initial_rsp as usize);
  161. let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
  162. }
  163. copy_env_regs(*cur_pid_fd, *new_pid_fd)?;
  164. }
  165. // Copy the file table. We do this last to ensure that all previously used file descriptors are
  166. // closed. The only exception -- the filetable selection fd and the current filetable fd --
  167. // will be closed by the child process.
  168. {
  169. // TODO: Use cross_scheme_links or something similar to avoid copying the file table in the
  170. // kernel.
  171. let new_filetable_fd = FdGuard::new(syscall::dup(*cur_filetable_fd, b"copy")?);
  172. let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
  173. let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*new_filetable_fd));
  174. }
  175. // Unblock context.
  176. syscall::kill(new_pid, SIGCONT)?;
  177. Ok(new_pid)
  178. }
  179. #[no_mangle]
  180. unsafe extern "sysv64" fn __relibc_internal_fork_impl(initial_rsp: *mut usize) -> usize {
  181. Error::mux(fork_inner(initial_rsp))
  182. }
  183. #[no_mangle]
  184. unsafe extern "sysv64" fn __relibc_internal_fork_hook(cur_filetable_fd: usize, new_pid_fd: usize) {
  185. let _ = syscall::close(cur_filetable_fd);
  186. let _ = syscall::close(new_pid_fd);
  187. }
  188. #[no_mangle]
  189. core::arch::global_asm!("
  190. .p2align 6
  191. .globl fork_wrapper
  192. .type fork_wrapper, @function
  193. fork_wrapper:
  194. push rbp
  195. mov rbp, rsp
  196. push rbx
  197. push rbp
  198. push r12
  199. push r13
  200. push r14
  201. push r15
  202. sub rsp, 32
  203. stmxcsr [rsp+16]
  204. fnstcw [rsp+24]
  205. mov rdi, rsp
  206. call __relibc_internal_fork_impl
  207. jmp 2f
  208. fork_ret:
  209. mov rdi, [rsp]
  210. mov rsi, [rsp + 8]
  211. call __relibc_internal_fork_hook
  212. ldmxcsr [rsp+16]
  213. fldcw [rsp+24]
  214. xor rax, rax
  215. 2:
  216. add rsp, 32
  217. pop r15
  218. pop r14
  219. pop r13
  220. pop r12
  221. pop rbp
  222. pop rbx
  223. pop rbp
  224. ret
  225. .size fork_wrapper, . - fork_wrapper
  226. .globl pte_clone_ret
  227. .type pte_clone_ret, @function
  228. pte_clone_ret:
  229. # Load registers
  230. pop rax
  231. pop rdi
  232. pop rsi
  233. pop rdx
  234. pop rcx
  235. pop r8
  236. pop r9
  237. # Call entry point
  238. call rax
  239. ret
  240. .size pte_clone_ret, . - pte_clone_ret
  241. ");
  242. extern "sysv64" {
  243. fn fork_wrapper() -> usize;
  244. fn fork_ret();
  245. fn pte_clone_ret();
  246. }