2
0

clone.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. use core::arch::global_asm;
  2. use core::mem::size_of;
  3. use alloc::boxed::Box;
  4. use alloc::vec::Vec;
  5. use syscall::data::Map;
  6. use syscall::flag::{MapFlags, O_CLOEXEC};
  7. use syscall::error::{Error, Result, EINVAL, ENAMETOOLONG};
  8. use syscall::SIGCONT;
  9. use super::extra::{create_set_addr_space_buf, FdGuard};
  10. fn new_context() -> Result<(FdGuard, usize)> {
  11. // Create a new context (fields such as uid/gid will be inherited from the current context).
  12. let fd = FdGuard::new(syscall::open("thisproc:new/open_via_dup", O_CLOEXEC)?);
  13. // Extract pid.
  14. let mut buffer = [0_u8; 64];
  15. let len = syscall::fpath(*fd, &mut buffer)?;
  16. let buffer = buffer.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
  17. let colon_idx = buffer.iter().position(|c| *c == b':').ok_or(Error::new(EINVAL))?;
  18. let slash_idx = buffer.iter().skip(colon_idx).position(|c| *c == b'/').ok_or(Error::new(EINVAL))? + colon_idx;
  19. let pid_bytes = buffer.get(colon_idx + 1..slash_idx).ok_or(Error::new(EINVAL))?;
  20. let pid_str = core::str::from_utf8(pid_bytes).map_err(|_| Error::new(EINVAL))?;
  21. let pid = pid_str.parse::<usize>().map_err(|_| Error::new(EINVAL))?;
  22. Ok((fd, pid))
  23. }
  24. fn copy_str(cur_pid_fd: usize, new_pid_fd: usize, key: &str) -> Result<()> {
  25. let cur_name_fd = FdGuard::new(syscall::dup(cur_pid_fd, key.as_bytes())?);
  26. let new_name_fd = FdGuard::new(syscall::dup(new_pid_fd, key.as_bytes())?);
  27. // TODO: Max path size?
  28. let mut buf = [0_u8; 256];
  29. let len = syscall::read(*cur_name_fd, &mut buf)?;
  30. let buf = buf.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
  31. syscall::write(*new_name_fd, &buf)?;
  32. Ok(())
  33. }
  34. #[cfg(target_arch = "x86_64")]
  35. fn copy_env_regs(cur_pid_fd: usize, new_pid_fd: usize) -> Result<()> {
  36. // Copy environment registers.
  37. {
  38. let cur_env_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/env")?);
  39. let new_env_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/env")?);
  40. let mut env_regs = syscall::EnvRegisters::default();
  41. let _ = syscall::read(*cur_env_regs_fd, &mut env_regs)?;
  42. let _ = syscall::write(*new_env_regs_fd, &env_regs)?;
  43. }
  44. Ok(())
  45. }
  46. /// Spawns a new context sharing the same address space as the current one (i.e. a new thread).
  47. pub unsafe fn pte_clone_impl(stack: *mut usize) -> Result<usize> {
  48. let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
  49. let (new_pid_fd, new_pid) = new_context()?;
  50. // Allocate a new signal stack.
  51. {
  52. let sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
  53. const SIGSTACK_SIZE: usize = 1024 * 256;
  54. // TODO: Put sigstack at high addresses?
  55. let target_sigstack = syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE })? + SIGSTACK_SIZE;
  56. let _ = syscall::write(*sigstack_fd, &usize::to_ne_bytes(target_sigstack))?;
  57. }
  58. copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
  59. copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
  60. // Reuse existing address space
  61. {
  62. let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
  63. let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
  64. let buf = create_set_addr_space_buf(*cur_addr_space_fd, __relibc_internal_pte_clone_ret as usize, stack as usize);
  65. let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
  66. }
  67. // Reuse file table
  68. {
  69. let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
  70. let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
  71. let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*cur_filetable_fd))?;
  72. }
  73. // Reuse sigactions (on Linux, CLONE_THREAD requires CLONE_SIGHAND which implies the sigactions
  74. // table is reused).
  75. {
  76. let cur_sigaction_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigactions")?);
  77. let new_sigaction_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-sigactions")?);
  78. let _ = syscall::write(*new_sigaction_sel_fd, &usize::to_ne_bytes(*cur_sigaction_fd))?;
  79. }
  80. copy_env_regs(*cur_pid_fd, *new_pid_fd)?;
  81. // Unblock context.
  82. syscall::kill(new_pid, SIGCONT)?;
  83. let _ = syscall::waitpid(new_pid, &mut 0, syscall::WUNTRACED | syscall::WCONTINUED);
  84. Ok(0)
  85. }
  86. /// Spawns a new context which will not share the same address space as the current one. File
  87. /// descriptors from other schemes are reobtained with `dup`, and grants referencing such file
  88. /// descriptors are reobtained through `fmap`. Other mappings are kept but duplicated using CoW.
  89. pub fn fork_impl() -> Result<usize> {
  90. unsafe {
  91. Error::demux(__relibc_internal_fork_wrapper())
  92. }
  93. }
  94. fn fork_inner(initial_rsp: *mut usize) -> Result<usize> {
  95. let (cur_filetable_fd, new_pid_fd, new_pid);
  96. {
  97. let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
  98. (new_pid_fd, new_pid) = new_context()?;
  99. // Do not allocate new signal stack, but copy existing address (all memory will be re-mapped
  100. // CoW later).
  101. {
  102. let cur_sigstack_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigstack")?);
  103. let new_sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
  104. let mut sigstack_buf = usize::to_ne_bytes(0);
  105. let _ = syscall::read(*cur_sigstack_fd, &mut sigstack_buf);
  106. let _ = syscall::write(*new_sigstack_fd, &sigstack_buf);
  107. }
  108. copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
  109. copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
  110. {
  111. let cur_sigaction_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigactions")?);
  112. let new_sigaction_fd = FdGuard::new(syscall::dup(*cur_sigaction_fd, b"copy")?);
  113. let new_sigaction_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-sigactions")?);
  114. let _ = syscall::write(*new_sigaction_sel_fd, &usize::to_ne_bytes(*new_sigaction_fd))?;
  115. }
  116. // Copy existing files into new file table, but do not reuse the same file table (i.e. new
  117. // parent FDs will not show up for the child).
  118. {
  119. cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
  120. // This must be done before the address space is copied.
  121. unsafe {
  122. initial_rsp.write(*cur_filetable_fd);
  123. initial_rsp.add(1).write(*new_pid_fd);
  124. }
  125. }
  126. // CoW-duplicate address space.
  127. {
  128. let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
  129. // FIXME: Find mappings which use external file descriptors
  130. let new_addr_space_fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, b"exclusive")?);
  131. let mut buf = vec! [0_u8; 4096];
  132. let mut bytes_read = 0;
  133. loop {
  134. let new_bytes_read = syscall::read(*cur_addr_space_fd, &mut buf[bytes_read..])?;
  135. if new_bytes_read == 0 { break }
  136. bytes_read += new_bytes_read;
  137. }
  138. let bytes = &buf[..bytes_read];
  139. for struct_bytes in bytes.array_chunks::<{size_of::<usize>() * 4}>() {
  140. let mut words = struct_bytes.array_chunks::<{size_of::<usize>()}>().copied().map(usize::from_ne_bytes);
  141. let addr = words.next().unwrap();
  142. let size = words.next().unwrap();
  143. let flags = words.next().unwrap();
  144. let offset = words.next().unwrap();
  145. if flags & 0x8000_0000 == 0 {
  146. continue;
  147. }
  148. let map_flags = MapFlags::from_bits_truncate(flags);
  149. let grant_fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, format!("grant-{:x}", addr).as_bytes())?);
  150. redox_exec::mmap_remote(&new_addr_space_fd, &grant_fd, offset, addr, size, map_flags)?;
  151. }
  152. let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
  153. let buf = create_set_addr_space_buf(*new_addr_space_fd, __relibc_internal_fork_ret as usize, initial_rsp as usize);
  154. let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
  155. }
  156. copy_env_regs(*cur_pid_fd, *new_pid_fd)?;
  157. }
  158. // Copy the file table. We do this last to ensure that all previously used file descriptors are
  159. // closed. The only exception -- the filetable selection fd and the current filetable fd --
  160. // will be closed by the child process.
  161. {
  162. // TODO: Use cross_scheme_links or something similar to avoid copying the file table in the
  163. // kernel.
  164. let new_filetable_fd = FdGuard::new(syscall::dup(*cur_filetable_fd, b"copy")?);
  165. let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
  166. let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*new_filetable_fd));
  167. }
  168. // Unblock context.
  169. syscall::kill(new_pid, SIGCONT)?;
  170. // XXX: Killing with SIGCONT will put (pid, 65536) at key (pid, pgid) into the waitpid of this
  171. // context. This means that if pgid is changed (as it is in ion for example), the pgid message
  172. // in syscall::exit() will not be inserted as the key comparator thinks they're equal as their
  173. // PIDs are. So, we have to call this to clear the waitpid queue to prevent deadlocks.
  174. let _ = syscall::waitpid(new_pid, &mut 0, syscall::WUNTRACED | syscall::WCONTINUED);
  175. Ok(new_pid)
  176. }
  177. #[no_mangle]
  178. unsafe extern "sysv64" fn __relibc_internal_fork_impl(initial_rsp: *mut usize) -> usize {
  179. Error::mux(fork_inner(initial_rsp))
  180. }
  181. #[no_mangle]
  182. unsafe extern "sysv64" fn __relibc_internal_fork_hook(cur_filetable_fd: usize, new_pid_fd: usize) {
  183. let _ = syscall::close(cur_filetable_fd);
  184. let _ = syscall::close(new_pid_fd);
  185. }
  // Assembly trampolines used by `fork_impl` and `pte_clone_impl`.
  //
  // `#[no_mangle]` is not applied here: on a macro invocation the attribute is ignored (rustc
  // reports it as unused), and the symbol names are fixed by the assembly labels themselves.
  //
  // `__relibc_internal_fork_wrapper`
  //   Pushes rbp, rbx, rbp, r12-r15, reserves 32 bytes, stores MXCSR at [rsp+16] and the x87
  //   control word at [rsp+24], then calls `__relibc_internal_fork_impl` with `rsp` as its
  //   argument. Afterwards it jumps to the shared epilogue at label `2`.
  //
  // `__relibc_internal_fork_ret`
  //   Loads the two words at [rsp] and [rsp+8] into rdi/rsi and calls
  //   `__relibc_internal_fork_hook`, reloads MXCSR and the x87 control word from the slots the
  //   wrapper filled, zeroes rax and falls through into the shared epilogue, which undoes the
  //   wrapper's pushes and returns.
  //
  // `__relibc_internal_pte_clone_ret`
  //   Pops rax, rdi, rsi, rdx, rcx, r8 and r9 from the stack, loads MXCSR with 0x1F80 and the
  //   x87 control word with 0x031F through a temporary stack slot, then calls the address in
  //   rax.
  core::arch::global_asm!("
      .p2align 6
      .globl __relibc_internal_fork_wrapper
      .type __relibc_internal_fork_wrapper, @function
  __relibc_internal_fork_wrapper:
      push rbp
      mov rbp, rsp

      push rbx
      push rbp
      push r12
      push r13
      push r14
      push r15

      sub rsp, 32

      stmxcsr [rsp+16]
      fnstcw [rsp+24]

      mov rdi, rsp
      call __relibc_internal_fork_impl
      jmp 2f

      .size __relibc_internal_fork_wrapper, . - __relibc_internal_fork_wrapper

      .p2align 6
      .type __relibc_internal_fork_ret, @function
  __relibc_internal_fork_ret:
      mov rdi, [rsp]
      mov rsi, [rsp + 8]
      call __relibc_internal_fork_hook

      ldmxcsr [rsp+16]
      fldcw [rsp+24]

      xor rax, rax

      .p2align 4
  2:
      add rsp, 32
      pop r15
      pop r14
      pop r13
      pop r12
      pop rbp
      pop rbx

      pop rbp
      ret

      .size __relibc_internal_fork_ret, . - __relibc_internal_fork_ret

      .globl __relibc_internal_pte_clone_ret
      .type __relibc_internal_pte_clone_ret, @function
      .p2align 6
  __relibc_internal_pte_clone_ret:
      # Load registers
      pop rax
      pop rdi
      pop rsi
      pop rdx
      pop rcx
      pop r8
      pop r9

      sub rsp, 8

      mov DWORD PTR [rsp], 0x00001F80
      ldmxcsr [rsp]
      mov WORD PTR [rsp], 0x031F
      fldcw [rsp]

      add rsp, 8

      # Call entry point
      call rax

      ret
      .size __relibc_internal_pte_clone_ret, . - __relibc_internal_pte_clone_ret
  ");
  // Symbols defined by the `global_asm!` block in this file.
  extern "sysv64" {
      /// Entry point used by `fork_impl`; returns the value produced by
      /// `__relibc_internal_fork_impl`, to be decoded with `Error::demux`.
      fn __relibc_internal_fork_wrapper() -> usize;

      // The two symbols below are never called from Rust. Only their addresses are taken and
      // passed to `create_set_addr_space_buf`.

      /// Trampoline whose address `pte_clone_impl` hands to `create_set_addr_space_buf`.
      fn __relibc_internal_pte_clone_ret();

      /// Trampoline whose address `fork_inner` hands to `create_set_addr_space_buf`.
      fn __relibc_internal_fork_ret();
  }