3 年之前 · 049a5156d7
--- a/src/header/sys_auxv/mod.rs
+++ b/src/header/sys_auxv/mod.rs
@@ -2,33 +2,7 @@
 
				 
			
 
				 use crate::platform::types::*;
			
 
				 
			
 
				-pub const AT_NULL: usize = 0; /* End of vector */
			
 
				-pub const AT_IGNORE: usize = 1; /* Entry should be ignored */
			
 
				-pub const AT_EXECFD: usize = 2; /* File descriptor of program */
			
 
				-pub const AT_PHDR: usize = 3; /* Program headers for program */
			
 
				-pub const AT_PHENT: usize = 4; /* Size of program header entry */
			
 
				-pub const AT_PHNUM: usize = 5; /* Number of program headers */
			
 
				-pub const AT_PAGESZ: usize = 6; /* System page size */
			
 
				-pub const AT_BASE: usize = 7; /* Base address of interpreter */
			
 
				-pub const AT_FLAGS: usize = 8; /* Flags */
			
 
				-pub const AT_ENTRY: usize = 9; /* Entry point of program */
			
 
				-pub const AT_NOTELF: usize = 10; /* Program is not ELF */
			
 
				-pub const AT_UID: usize = 11; /* Real uid */
			
 
				-pub const AT_EUID: usize = 12; /* Effective uid */
			
 
				-pub const AT_GID: usize = 13; /* Real gid */
			
 
				-pub const AT_EGID: usize = 14; /* Effective gid */
			
 
				-pub const AT_CLKTCK: usize = 17; /* Frequency of times() */
			
 
				-pub const AT_PLATFORM: usize = 15; /* String identifying platform.  */
			
 
				-pub const AT_HWCAP: usize = 16; /* Machine-dependent hints about */
			
 
				-pub const AT_FPUCW: usize = 18; /* Used FPU control word.  */
			
 
				-pub const AT_DCACHEBSIZE: usize = 19; /* Data cache block size.  */
			
 
				-pub const AT_ICACHEBSIZE: usize = 20; /* Instruction cache block size.  */
			
 
				-pub const AT_UCACHEBSIZE: usize = 21; /* Unified cache block size.  */
			
 
				-pub const AT_IGNOREPPC: usize = 22; /* Entry should be ignored.  */
			
 
				-pub const AT_BASE_PLATFORM: usize = 24; /* String identifying real platforms.*/
			
 
				-pub const AT_RANDOM: usize = 25; /* Address of 16 random bytes.  */
			
 
				-pub const AT_HWCAP2: usize = 26; /* More machine-dependent hints about*/
			
 
				-pub const AT_EXECFN: usize = 31; /* Filename of executable.  */
			
 
				+pub use crate::platform::auxv_defs::*;
			
 
				 
			
 
				 #[no_mangle]
			
 
				 pub extern "C" fn getauxval(_t: c_ulong) -> c_ulong {
			
--- a/src/platform/auxv_defs.rs
+++ b/src/platform/auxv_defs.rs
@@ -0,0 +1,27 @@
 
				// Auxiliary vector entry types (the `AT_*` keys), listed in numeric order.

				/// End of vector.
				pub const AT_NULL: usize = 0;
				/// Entry should be ignored.
				pub const AT_IGNORE: usize = 1;
				/// File descriptor of program.
				pub const AT_EXECFD: usize = 2;
				/// Program headers for program.
				pub const AT_PHDR: usize = 3;
				/// Size of program header entry.
				pub const AT_PHENT: usize = 4;
				/// Number of program headers.
				pub const AT_PHNUM: usize = 5;
				/// System page size.
				pub const AT_PAGESZ: usize = 6;
				/// Base address of interpreter.
				pub const AT_BASE: usize = 7;
				/// Flags.
				pub const AT_FLAGS: usize = 8;
				/// Entry point of program.
				pub const AT_ENTRY: usize = 9;
				/// Program is not ELF.
				pub const AT_NOTELF: usize = 10;
				/// Real uid.
				pub const AT_UID: usize = 11;
				/// Effective uid.
				pub const AT_EUID: usize = 12;
				/// Real gid.
				pub const AT_GID: usize = 13;
				/// Effective gid.
				pub const AT_EGID: usize = 14;
				/// String identifying platform.
				pub const AT_PLATFORM: usize = 15;
				/// Machine-dependent hints about processor capabilities.
				pub const AT_HWCAP: usize = 16;
				/// Frequency of times().
				pub const AT_CLKTCK: usize = 17;
				/// Used FPU control word.
				pub const AT_FPUCW: usize = 18;
				/// Data cache block size.
				pub const AT_DCACHEBSIZE: usize = 19;
				/// Instruction cache block size.
				pub const AT_ICACHEBSIZE: usize = 20;
				/// Unified cache block size.
				pub const AT_UCACHEBSIZE: usize = 21;
				/// Entry should be ignored.
				pub const AT_IGNOREPPC: usize = 22;
				/// String identifying real platforms.
				pub const AT_BASE_PLATFORM: usize = 24;
				/// Address of 16 random bytes.
				pub const AT_RANDOM: usize = 25;
				/// More machine-dependent hints about processor capabilities.
				pub const AT_HWCAP2: usize = 26;
				/// Filename of executable.
				pub const AT_EXECFN: usize = 31;
			
--- a/src/platform/mod.rs
+++ b/src/platform/mod.rs
@@ -34,6 +34,12 @@ mod pte;
 
				 pub use self::rlb::{Line, RawLineBuffer};
			
 
				 pub mod rlb;
			
 
				 
			
 
				+#[cfg(target_os = "linux")]
			
 
				+pub mod auxv_defs;
			
 
				+
			
 
				+#[cfg(target_os = "redox")]
			
 
				+pub use redox_exec::auxv_defs;
			
 
				+
			
 
				 use self::types::*;
			
 
				 pub mod types;
			
 
				 
			
--- a/src/platform/redox/exec.rs
+++ b/src/platform/redox/exec.rs
@@ -1,20 +1,245 @@
 
				+use crate::c_str::{CStr, CString};
			
 
				+use crate::core_io::{BufReader, prelude::*, SeekFrom};
			
 
				 use crate::fs::File;
			
 
				+use crate::header::{fcntl, string::strlen};
			
 
				+use crate::platform::{sys::{S_ISUID, S_ISGID}, types::*};
			
 
				 
			
 
				-use syscall::error::Result;
			
 
				-use redox_exec::FdGuard;
			
 
				+use syscall::data::Stat;
			
 
				+use syscall::flag::*;
			
 
				+use syscall::error::*;
			
 
				+use redox_exec::{FdGuard, FexecResult};
			
 
				 
			
 
				-pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_envs_size_without_nul: usize) -> Result<usize> {
			
 
				+fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], total_args_envs_size: usize, interp_override: Option<redox_exec::InterpOverride>) -> Result<usize> {
			
 
				     let fd = *file;
			
 
				     core::mem::forget(file);
			
 
				     let image_file = FdGuard::new(fd as usize);
			
 
				 
			
 
				     let open_via_dup = FdGuard::new(syscall::open("thisproc:current/open_via_dup", 0)?);
			
 
				 
			
 
				-    let total_args_envs_size = args_envs_size_without_nul + args.len() + envs.len();
			
 
				-    let addrspace_selection_fd = redox_exec::fexec_impl(image_file, open_via_dup, path, args.iter().rev(), envs.iter().rev(), total_args_envs_size)?;
			
 
				+    let addrspace_selection_fd = match redox_exec::fexec_impl(image_file, open_via_dup, path, args.iter().rev(), envs.iter().rev(), total_args_envs_size, interp_override)? {
			
 
				+        FexecResult::Normal { addrspace_handle } => addrspace_handle,
			
 
				+        FexecResult::Interp { image_file, open_via_dup, path, interp_override: new_interp_override } => {
			
 
				+            drop(image_file);
			
 
				+            drop(open_via_dup);
			
 
				+
			
 
				+            // According to elf(5), PT_INTERP requires that the interpreter path be
			
 
				+            // null-terminated. Violating this should therefore give the "format error" ENOEXEC.
			
 
				+            let path_cstr = CStr::from_bytes_with_nul(&path).map_err(|_| Error::new(ENOEXEC))?;
			
 
				+
			
 
				+            return execve(path_cstr, ArgEnv::Parsed { total_args_envs_size, args, envs }, Some(new_interp_override));
			
 
				+        }
			
 
				+    };
			
 
				 
			
 
				     // Dropping this FD will cause the address space switch.
			
 
				     drop(addrspace_selection_fd);
			
 
				 
			
 
				     unreachable!();
			
 
				 }
			
 
				/// Arguments and environment handed to `execve`, in one of two representations.
				pub enum ArgEnv<'a> {
				    /// Raw C-style arrays of C strings; `execve` walks each array until it reads a null
				    /// pointer.
				    C {
				        argv: *const *mut c_char,
				        envp: *const *mut c_char,
				    },
				    /// Already-parsed byte slices together with their precomputed total size, as passed
				    /// back in when `execve` is re-entered for an ELF interpreter.
				    Parsed {
				        args: &'a [&'a [u8]],
				        envs: &'a [&'a [u8]],
				        total_args_envs_size: usize,
				    },
				}
			
 
				+pub fn execve(path: &CStr, arg_env: ArgEnv, interp_override: Option<redox_exec::InterpOverride>) -> Result<usize> {
			
 
				+    // NOTE: We must omit O_CLOEXEC and close manually, otherwise it will be closed before we
			
 
				+    // have even read it!
			
 
				+    let mut image_file = File::open(path, O_RDONLY as c_int).map_err(|_| Error::new(ENOENT))?;
			
 
				+
			
 
				+    // With execve now being implemented in userspace, we need to check ourselves that this
			
 
				+    // file is actually executable. While checking for read permission is unnecessary as the
			
 
				+    // scheme will not allow us to read otherwise, the execute bit is completely unenforced. We
			
 
				+    // have the permission to mmap executable memory and fill it with the program even if it is
			
 
				+    // unset, so the best we can do is check that nothing is executed by accident.
			
 
				+    //
			
 
				+    // TODO: At some point we might have capabilities limiting the ability to allocate
			
 
				+    // executable memory, and in that case we might use the `escalate:` scheme as we already do
			
 
				+    // when the binary needs setuid/setgid.
			
 
				+
			
 
				+    let mut stat = Stat::default();
			
 
				+    syscall::fstat(*image_file as usize, &mut stat)?;
			
 
				+    let uid = syscall::getuid()?;
			
 
				+    let gid = syscall::getuid()?;
			
 
				+
			
 
				+    let mode = if uid == stat.st_uid as usize {
			
 
				+        (stat.st_mode >> 3 * 2) & 0o7
			
 
				+    } else if gid == stat.st_gid as usize {
			
 
				+        (stat.st_mode >> 3 * 1) & 0o7
			
 
				+    } else {
			
 
				+        stat.st_mode & 0o7
			
 
				+    };
			
 
				+
			
 
				+    if mode & 0o1 == 0o0 {
			
 
				+        return Err(Error::new(EPERM));
			
 
				+    }
			
 
				+    let wants_setugid = stat.st_mode & ((S_ISUID | S_ISGID) as u16) != 0;
			
 
				+
			
 
				+    // Count arguments
			
 
				+    let mut len = 0;
			
 
				+
			
 
				+    match arg_env {
			
 
				+        ArgEnv::C { argv, .. } => unsafe {
			
 
				+            while !(*argv.add(len)).is_null() {
			
 
				+                len += 1;
			
 
				+            }
			
 
				+        }
			
 
				+        ArgEnv::Parsed { args, .. } => len = args.len(),
			
 
				+    }
			
 
				+
			
 
				+    let mut args: Vec<&[u8]> = Vec::with_capacity(len);
			
 
				+
			
 
				+    // Read shebang (for example #!/bin/sh)
			
 
				+    let mut _interpreter_path = None;
			
 
				+    let is_interpreted = {
			
 
				+        let mut read = 0;
			
 
				+        let mut shebang = [0; 2];
			
 
				+
			
 
				+        while read < 2 {
			
 
				+            match image_file.read(&mut shebang).map_err(|_| Error::new(ENOEXEC))? {
			
 
				+                0 => break,
			
 
				+                i => read += i,
			
 
				+            }
			
 
				+        }
			
 
				+        shebang == *b"#!"
			
 
				+    };
			
 
				+    // Since the fexec implementation is almost fully done in userspace, the kernel can no
			
 
				+    // longer set UID/GID accordingly, and this code checking for them before using
			
 
				+    // hypothetical interfaces to upgrade UID/GID, can not be trusted. So we ask the
			
 
				+    // `escalate:` scheme for help. Note that `escalate:` can be deliberately excluded from the
			
 
				+    // scheme namespace to deny privilege escalation (such as su/sudo/doas) for untrusted
			
 
				+    // processes.
			
 
				+    //
			
 
				+    // According to execve(2), Linux and most other UNIXes ignore setuid/setgid for interpreted
			
 
				+    // executables and thereby simply keep the privileges as is. For compatibility we do that
			
 
				+    // too.
			
 
				+
			
 
				+    if is_interpreted {
			
 
				+        // TODO: Does this support prepending args to the interpreter? E.g.
			
 
				+        // #!/usr/bin/env python3
			
 
				+
			
 
				+        // So, this file is interpreted.
			
 
				+        // Then, read the actual interpreter:
			
 
				+        let mut interpreter = Vec::new();
			
 
				+        BufReader::new(&mut image_file).read_until(b'\n', &mut interpreter).map_err(|_| Error::new(EIO))?;
			
 
				+        if interpreter.ends_with(&[b'\n']) {
			
 
				+            interpreter.pop().unwrap();
			
 
				+        }
			
 
				+        let cstring = CString::new(interpreter).map_err(|_| Error::new(ENOEXEC))?;
			
 
				+        image_file = File::open(&cstring, O_RDONLY as c_int).map_err(|_| Error::new(ENOENT))?;
			
 
				+
			
 
				+        // Make sure path is kept alive long enough, and push it to the arguments
			
 
				+        _interpreter_path = Some(cstring);
			
 
				+        let path_ref = _interpreter_path.as_ref().unwrap();
			
 
				+        args.push(path_ref.as_bytes());
			
 
				+    } else {
			
 
				+        image_file.seek(SeekFrom::Start(0)).map_err(|_| Error::new(EIO))?;
			
 
				+    }
			
 
				+
			
 
				+    let (total_args_envs_size, args, envs): (usize, Vec<_>, Vec<_>) = match arg_env {
			
 
				+        ArgEnv::C { mut argv, mut envp } => unsafe {
			
 
				+            let mut args_envs_size_without_nul = 0;
			
 
				+
			
 
				+            // Arguments
			
 
				+            while !argv.read().is_null() {
			
 
				+                let arg = argv.read();
			
 
				+
			
 
				+                let len = strlen(arg);
			
 
				+                args.push(core::slice::from_raw_parts(arg as *const u8, len));
			
 
				+                args_envs_size_without_nul += len;
			
 
				+                argv = argv.add(1);
			
 
				+            }
			
 
				+
			
 
				+            // Environment variables
			
 
				+            let mut len = 0;
			
 
				+            while !envp.add(len).read().is_null() {
			
 
				+                len += 1;
			
 
				+            }
			
 
				+
			
 
				+            let mut envs: Vec<&[u8]> = Vec::with_capacity(len);
			
 
				+            while !envp.read().is_null() {
			
 
				+                let env = envp.read();
			
 
				+
			
 
				+                let len = strlen(env);
			
 
				+                envs.push(core::slice::from_raw_parts(env as *const u8, len));
			
 
				+                args_envs_size_without_nul += len;
			
 
				+                envp = envp.add(1);
			
 
				+            }
			
 
				+            (args_envs_size_without_nul + args.len() + envs.len(), args, envs)
			
 
				+        }
			
 
				+        ArgEnv::Parsed { args: new_args, envs, total_args_envs_size } => {
			
 
				+            let prev_size: usize = args.iter().map(|a| a.len()).sum();
			
 
				+            args.extend(new_args);
			
 
				+            (total_args_envs_size + prev_size, args, Vec::from(envs))
			
 
				+        }
			
 
				+    };
			
 
				+
			
 
				+
			
 
				+    // Close all O_CLOEXEC file descriptors. TODO: close_range?
			
 
				+    {
			
 
				+        // NOTE: This approach of implementing O_CLOEXEC will not work in multithreaded
			
 
				+        // scenarios. While execve() is undefined according to POSIX if there exist sibling
			
 
				+        // threads, it could still be allowed by keeping certain file descriptors and instead
			
 
				+        // set the active file table.
			
 
				+        let files_fd = File::new(syscall::open("thisproc:current/filetable", O_RDONLY)? as c_int);
			
 
				+        for line in BufReader::new(files_fd).lines() {
			
 
				+            let line = match line {
			
 
				+                Ok(l) => l,
			
 
				+                Err(_) => break,
			
 
				+            };
			
 
				+            let fd = match line.parse::<usize>() {
			
 
				+                Ok(f) => f,
			
 
				+                Err(_) => continue,
			
 
				+            };
			
 
				+
			
 
				+            let flags = syscall::fcntl(fd, F_GETFD, 0)?;
			
 
				+
			
 
				+            if flags & O_CLOEXEC == O_CLOEXEC {
			
 
				+                let _ = syscall::close(fd);
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    if !is_interpreted && wants_setugid {
			
 
				+        // Make sure the last file descriptor not covered by O_CLOEXEC is not leaked.
			
 
				+        drop(image_file);
			
 
				+
			
 
				+        // We are now going to invoke `escalate:` rather than loading the program ourselves.
			
 
				+        let escalate_fd = FdGuard::new(syscall::open("escalate:", O_WRONLY)?);
			
 
				+
			
 
				+        // First, we write the path.
			
 
				+        //
			
 
				+        // TODO: For improved security, use a hypothetical SYS_DUP_FORWARD syscall to give the
			
 
				+        // scheme our file descriptor. It can check through the kernel-overwritten stat.st_dev
			
 
				+        // field that it pertains to a "trusted" scheme (i.e. of at least the privilege the
			
 
				+        // new uid/gid has), although for now only root can open schemes. Passing a file
			
 
				+        // descriptor and not a path will allow escalated to run in a limited namespace.
			
 
				+        //
			
 
				+        // TODO: Plus, at this point fexecve is not implemented (but specified in
			
 
				+        // POSIX.1-2008), and to avoid bad syscalls such as fpath, passing a file descriptor
			
 
				+        // would be better.
			
 
				+        let _ = syscall::write(*escalate_fd, path.to_bytes());
			
 
				+
			
 
				+        // Second, we write the flattened args and envs with NUL characters separating
			
 
				+        // individual items. This can be copied directly into the new executable's memory.
			
 
				+        let _ = syscall::write(*escalate_fd, &flatten_with_nul(args))?;
			
 
				+        let _ = syscall::write(*escalate_fd, &flatten_with_nul(envs))?;
			
 
				+
			
 
				+        // Closing will notify the scheme, and from that point we will no longer have control
			
 
				+        // over this process (unless it fails). We do this manually since drop cannot handle
			
 
				+        // errors.
			
 
				+        let fd = *escalate_fd as usize;
			
 
				+        core::mem::forget(escalate_fd);
			
 
				+
			
 
				+        syscall::close(fd)?;
			
 
				+
			
 
				+        unreachable!()
			
 
				+    } else {
			
 
				+        fexec_impl(image_file, path.to_bytes(), &args, &envs, total_args_envs_size, interp_override)
			
 
				+    }
			
 
				+}
			
 
				/// Concatenates every item of `iter` into a single byte buffer, appending one NUL byte
				/// after each item (including the last).
				fn flatten_with_nul<T>(iter: impl IntoIterator<Item = T>) -> Box<[u8]> where T: AsRef<[u8]> {
				    let mut flat: Vec<u8> = Vec::new();
				    iter.into_iter().for_each(|entry| {
				        flat.extend_from_slice(entry.as_ref());
				        flat.push(0);
				    });
				    flat.into_boxed_slice()
				}
			
--- a/src/platform/redox/mod.rs
+++ b/src/platform/redox/mod.rs
@@ -34,6 +34,11 @@ use super::{errno, types::*, Pal, Read};
 
				 static mut BRK_CUR: *mut c_void = ptr::null_mut();
			
 
				 static mut BRK_END: *mut c_void = ptr::null_mut();
			
 
				 
			
 
				/// Page size, in bytes, used by this platform layer.
				const PAGE_SIZE: usize = 4096;

				/// Rounds `val` up to the nearest multiple of `PAGE_SIZE`; exact multiples (including 0)
				/// are returned unchanged.
				fn round_up_to_page_size(val: usize) -> usize {
				    let padded = val + PAGE_SIZE - 1;
				    // Drop the partial-page remainder of the padded value.
				    padded - padded % PAGE_SIZE
				}
			
 
				+
			
 
				 mod clone;
			
 
				 mod epoll;
			
 
				 mod exec;
			
@@ -67,14 +72,6 @@ pub fn e(sys: Result<usize>) -> usize {
 
				         }
			
 
				     }
			
 
				 }
			
 
				-fn flatten_with_nul<T>(iter: impl IntoIterator<Item = T>) -> Box<[u8]> where T: AsRef<[u8]> {
			
 
				-    let mut vec = Vec::new();
			
 
				-    for item in iter {
			
 
				-        vec.extend(item.as_ref());
			
 
				-        vec.push(b'\0');
			
 
				-    }
			
 
				-    vec.into_boxed_slice()
			
 
				-}
			
 
				 
			
 
				 pub struct Sys;
			
 
				 
			
@@ -216,223 +213,10 @@ impl Pal for Sys {
 
				 
			
 
				     unsafe fn execve(
			
 
				         path: &CStr,
			
 
				-        mut argv: *const *mut c_char,
			
 
				-        mut envp: *const *mut c_char,
			
 
				+        argv: *const *mut c_char,
			
 
				+        envp: *const *mut c_char,
			
 
				     ) -> c_int {
			
 
				-        // NOTE: We must omit O_CLOEXEC and close manually, otherwise it will be closed before we
			
 
				-        // have even read it!
			
 
				-        let mut file = match File::open(path, fcntl::O_RDONLY) {
			
 
				-            Ok(file) => file,
			
 
				-            Err(_) => return -1,
			
 
				-        };
			
 
				-        let fd = *file as usize;
			
 
				-
			
 
				-        // With execve now being implemented in userspace, we need to check ourselves that this
			
 
				-        // file is actually executable. While checking for read permission is unnecessary as the
			
 
				-        // scheme will not allow us to read otherwise, the execute bit is completely unenforced. We
			
 
				-        // have the permission to mmap executable memory and fill it with the program even if it is
			
 
				-        // unset, so the best we can do is check that nothing is executed by accident.
			
 
				-        //
			
 
				-        // TODO: At some point we might have capabilities limiting the ability to allocate
			
 
				-        // executable memory, and in that case we might use the `escalate:` scheme as we already do
			
 
				-        // when the binary needs setuid/setgid.
			
 
				-
			
 
				-        let mut stat = redox_stat::default();
			
 
				-        if e(syscall::fstat(fd, &mut stat)) == !0 {
			
 
				-            return -1;
			
 
				-        }
			
 
				-        let uid = e(syscall::getuid());
			
 
				-        if uid == !0 {
			
 
				-            return -1;
			
 
				-        }
			
 
				-        let gid = e(syscall::getuid());
			
 
				-        if gid == !0 {
			
 
				-            return -1;
			
 
				-        }
			
 
				-
			
 
				-        let mode = if uid == stat.st_uid as usize {
			
 
				-            (stat.st_mode >> 3 * 2) & 0o7
			
 
				-        } else if gid == stat.st_gid as usize {
			
 
				-            (stat.st_mode >> 3 * 1) & 0o7
			
 
				-        } else {
			
 
				-            stat.st_mode & 0o7
			
 
				-        };
			
 
				-
			
 
				-        if mode & 0o1 == 0o0 {
			
 
				-            errno = EPERM;
			
 
				-            return -1;
			
 
				-        }
			
 
				-        let wants_setugid = stat.st_mode & ((S_ISUID | S_ISGID) as u16) != 0;
			
 
				-
			
 
				-        // Count arguments
			
 
				-        let mut len = 0;
			
 
				-        while !(*argv.add(len)).is_null() {
			
 
				-            len += 1;
			
 
				-        }
			
 
				-
			
 
				-        let mut args: Vec<&[u8]> = Vec::with_capacity(len);
			
 
				-
			
 
				-        // Read shebang (for example #!/bin/sh)
			
 
				-        let mut _interpreter_path = None;
			
 
				-        let is_interpreted = {
			
 
				-            let mut read = 0;
			
 
				-            let mut shebang = [0; 2];
			
 
				-
			
 
				-            while read < 2 {
			
 
				-                match file.read(&mut shebang) {
			
 
				-                    Ok(0) => break,
			
 
				-                    Ok(i) => read += i,
			
 
				-                    Err(_) => return -1,
			
 
				-                }
			
 
				-            }
			
 
				-            shebang == *b"#!"
			
 
				-        };
			
 
				-        // Since the fexec implementation is almost fully done in userspace, the kernel can no
			
 
				-        // longer set UID/GID accordingly, and this code checking for them before using
			
 
				-        // hypothetical interfaces to upgrade UID/GID, can not be trusted. So we ask the
			
 
				-        // `escalate:` scheme for help. Note that `escalate:` can be deliberately excluded from the
			
 
				-        // scheme namespace to deny privilege escalation (such as su/sudo/doas) for untrusted
			
 
				-        // processes.
			
 
				-        //
			
 
				-        // According to execve(2), Linux and most other UNIXes ignore setuid/setgid for interpreted
			
 
				-        // executables and thereby simply keep the privileges as is. For compatibility we do that
			
 
				-        // too.
			
 
				-
			
 
				-        if is_interpreted {
			
 
				-            // So, this file is interpreted.
			
 
				-            // Then, read the actual interpreter:
			
 
				-            let mut interpreter = Vec::new();
			
 
				-            if BufReader::new(&mut file).read_until(b'\n', &mut interpreter).is_err() {
			
 
				-                return -1;
			
 
				-            }
			
 
				-            if interpreter.ends_with(&[b'\n']) {
			
 
				-                interpreter.pop().unwrap();
			
 
				-            }
			
 
				-            let cstring = match CString::new(interpreter) {
			
 
				-                Ok(cstring) => cstring,
			
 
				-                Err(_) => return -1,
			
 
				-            };
			
 
				-            file = match File::open(&cstring, fcntl::O_RDONLY) {
			
 
				-                Ok(file) => file,
			
 
				-                Err(_) => return -1,
			
 
				-            };
			
 
				-
			
 
				-            // Make sure path is kept alive long enough, and push it to the arguments
			
 
				-            _interpreter_path = Some(cstring);
			
 
				-            let path_ref = _interpreter_path.as_ref().unwrap();
			
 
				-            args.push(path_ref.as_bytes());
			
 
				-        } else {
			
 
				-            if file.seek(SeekFrom::Start(0)).is_err() {
			
 
				-                return -1;
			
 
				-            }
			
 
				-        }
			
 
				-        let mut args_envs_size_without_nul = 0;
			
 
				-
			
 
				-        // Arguments
			
 
				-        while !argv.read().is_null() {
			
 
				-            let arg = argv.read();
			
 
				-
			
 
				-            let len = strlen(arg);
			
 
				-            args.push(core::slice::from_raw_parts(arg as *const u8, len));
			
 
				-            args_envs_size_without_nul += len;
			
 
				-            argv = argv.add(1);
			
 
				-        }
			
 
				-
			
 
				-        // Environment variables
			
 
				-        let mut len = 0;
			
 
				-        while !envp.add(len).read().is_null() {
			
 
				-            len += 1;
			
 
				-        }
			
 
				-
			
 
				-        let mut envs: Vec<&[u8]> = Vec::with_capacity(len);
			
 
				-        while !envp.read().is_null() {
			
 
				-            let env = envp.read();
			
 
				-
			
 
				-            let len = strlen(env);
			
 
				-            envs.push(core::slice::from_raw_parts(env as *const u8, len));
			
 
				-            args_envs_size_without_nul += len;
			
 
				-            envp = envp.add(1);
			
 
				-        }
			
 
				-
			
 
				-        // Close all O_CLOEXEC file descriptors. TODO: close_range?
			
 
				-        {
			
 
				-            // NOTE: This approach of implementing O_CLOEXEC will not work in multithreaded
			
 
				-            // scenarios. While execve() is undefined according to POSIX if there exist sibling
			
 
				-            // threads, it could still be allowed by keeping certain file descriptors and instead
			
 
				-            // set the active file table.
			
 
				-            let name = CStr::from_bytes_with_nul(b"thisproc:current/filetable\0").expect("string should be valid");
			
 
				-            let files_fd = match File::open(name, fcntl::O_RDONLY) {
			
 
				-                Ok(f) => f,
			
 
				-                Err(_) => return -1,
			
 
				-            };
			
 
				-            for line in BufReader::new(files_fd).lines() {
			
 
				-                let line = match line {
			
 
				-                    Ok(l) => l,
			
 
				-                    Err(_) => break,
			
 
				-                };
			
 
				-                let fd = match line.parse::<usize>() {
			
 
				-                    Ok(f) => f,
			
 
				-                    Err(_) => continue,
			
 
				-                };
			
 
				-
			
 
				-                let flags = Self::fcntl(fd as c_int, fcntl::F_GETFD, 0);
			
 
				-                if flags != -1 {
			
 
				-                    if flags & fcntl::O_CLOEXEC == fcntl::O_CLOEXEC {
			
 
				-                        let _ = Self::close(fd as c_int);
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				-        }
			
 
				-
			
 
				-        if !is_interpreted && wants_setugid {
			
 
				-            // Make sure the last file descriptor not covered by O_CLOEXEC is not leaked.
			
 
				-            drop(file);
			
 
				-
			
 
				-            let name = CStr::from_bytes_with_nul(b"escalate:\0").expect("string should be valid");
			
 
				-            // We are now going to invoke `escalate:` rather than loading the program ourselves.
			
 
				-            let mut escalate_fd = match File::open(name, fcntl::O_WRONLY) {
			
 
				-                Ok(f) => f,
			
 
				-                Err(_) => return -1,
			
 
				-            };
			
 
				-
			
 
				-            // First, we write the path.
			
 
				-            //
			
 
				-            // TODO: For improved security, use a hypothetical SYS_DUP_FORWARD syscall to give the
			
 
				-            // scheme our file descriptor. It can check through the kernel-overwritten stat.st_dev
			
 
				-            // field that it pertains to a "trusted" scheme (i.e. of at least the privilege the
			
 
				-            // new uid/gid has), although for now only root can open schemes. Passing a file
			
 
				-            // descriptor and not a path will allow escalated to run in a limited namespace.
			
 
				-            //
			
 
				-            // TODO: Plus, at this point fexecve is not implemented (but specified in
			
 
				-            // POSIX.1-2008), and to avoid bad syscalls such as fpath, passing a file descriptor
			
 
				-            // would be better.
			
 
				-            if escalate_fd.write_all(path.to_bytes()).is_err() {
			
 
				-                return -1;
			
 
				-            }
			
 
				-
			
 
				-            // Second, we write the flattened args and envs with NUL characters separating
			
 
				-            // individual items. This can be copied directly into the new executable's memory.
			
 
				-            if escalate_fd.write_all(&flatten_with_nul(args)).is_err() {
			
 
				-                return -1;
			
 
				-            }
			
 
				-            if escalate_fd.write_all(&flatten_with_nul(envs)).is_err() {
			
 
				-                return -1;
			
 
				-            }
			
 
				-
			
 
				-            // Closing will notify the scheme, and from that point we will no longer have control
			
 
				-            // over this process (unless it fails). We do this manually since drop cannot handle
			
 
				-            // errors.
			
 
				-            let fd = *escalate_fd as usize;
			
 
				-            core::mem::forget(escalate_fd);
			
 
				-
			
 
				-            if let Err(err) = syscall::close(fd) {
			
 
				-                return e(Err(err)) as c_int;
			
 
				-            }
			
 
				-
			
 
				-            unreachable!()
			
 
				-        } else {
			
 
				-            e(self::exec::fexec_impl(file, path.to_bytes(), &args, &envs, args_envs_size_without_nul)) as c_int
			
 
				-        }
			
 
				+        e(self::exec::execve(path, self::exec::ArgEnv::C { argv, envp }, None)) as c_int
			
 
				     }
			
 
				 
			
 
				     fn fchdir(fd: c_int) -> c_int {
			
@@ -701,7 +485,7 @@ impl Pal for Sys {
 
				     }
			
 
				 
			
 
				     fn getpagesize() -> usize {
			
 
				-        4096
			
 
				+        PAGE_SIZE
			
 
				     }
			
 
				 
			
 
				     fn getpgid(pid: pid_t) -> pid_t {
			
@@ -848,7 +632,7 @@ impl Pal for Sys {
 
				     ) -> *mut c_void {
			
 
				         let map = Map {
			
 
				             offset: off as usize,
			
 
				-            size: len,
			
 
				+            size: round_up_to_page_size(len),
			
 
				             flags: syscall::MapFlags::from_bits_truncate(
			
 
				                 ((prot as usize) << 16) | ((flags as usize) & 0xFFFF),
			
 
				             ),
			
@@ -865,7 +649,7 @@ impl Pal for Sys {
 
				     unsafe fn mprotect(addr: *mut c_void, len: usize, prot: c_int) -> c_int {
			
 
				         e(syscall::mprotect(
			
 
				             addr as usize,
			
 
				-            len,
			
 
				+            round_up_to_page_size(len),
			
 
				             syscall::MapFlags::from_bits((prot as usize) << 16)
			
 
				                 .expect("mprotect: invalid bit pattern"),
			
 
				         )) as c_int
			
@@ -877,7 +661,7 @@ impl Pal for Sys {
 
				         /* TODO
			
 
				         e(syscall::msync(
			
 
				             addr as usize,
			
 
				-            len,
			
 
				+            round_up_to_page_size(len),
			
 
				             flags
			
 
				         )) as c_int
			
 
				         */
			
@@ -894,7 +678,7 @@ impl Pal for Sys {
 
				     }
			
 
				 
			
 
				     unsafe fn munmap(addr: *mut c_void, len: usize) -> c_int {
			
 
				-        if e(syscall::funmap(addr as usize, len)) == !0 {
			
 
				+        if e(syscall::funmap(addr as usize, round_up_to_page_size(len))) == !0 {
			
 
				             return !0;
			
 
				         }
			
 
				         0
			
--- a/src/platform/redox/redox-exec/src/lib.rs
+++ b/src/platform/redox/redox-exec/src/lib.rs
@@ -1,34 +1,46 @@
 
				 #![no_std]
			
 
				 
			
 
				-#![feature(array_chunks)]
			
 
				+#![feature(array_chunks, map_first_last)]
			
 
				 
			
 
				 extern crate alloc;
			
 
				 
			
 
				-use core::convert::TryFrom;
			
 
				 use core::mem::size_of;
			
 
				 
			
 
				 use alloc::{
			
 
				-    collections::{btree_map::Entry, BTreeMap},
			
 
				-    vec::Vec,
			
 
				+    boxed::Box,
			
 
				+    collections::BTreeMap,
			
 
				     vec,
			
 
				 };
			
 
				 
			
 
				 use syscall::{
			
 
				     error::*,
			
 
				-    flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags, O_WRONLY, SEEK_SET},
			
 
				+    flag::{MapFlags, SEEK_SET},
			
 
				 };
			
 
				 
			
 
				 #[cfg(target_arch = "x86_64")]
			
 
				 const PAGE_SIZE: usize = 4096;
			
 
				 
			
 
				-pub fn fexec_impl<A, E>(image_file: FdGuard, open_via_dup: FdGuard, path: &[u8], args: A, envs: E, total_args_envs_size: usize) -> Result<FdGuard>
			
 
				+pub enum FexecResult {
			
 
				+    Normal { addrspace_handle: FdGuard },
			
 
				+    Interp { path: Box<[u8]>, image_file: FdGuard, open_via_dup: FdGuard, interp_override: InterpOverride },
			
 
				+}
			
 
				+pub struct InterpOverride {
			
 
				+    phs: Box<[u8]>,
			
 
				+    at_entry: usize,
			
 
				+    at_phnum: usize,
			
 
				+    at_phent: usize,
			
 
				+    name: Box<[u8]>,
			
 
				+    tree: BTreeMap<usize, usize>,
			
 
				+}
			
 
				+
			
 
				+pub fn fexec_impl<A, E>(image_file: FdGuard, open_via_dup: FdGuard, path: &[u8], args: A, envs: E, total_args_envs_size: usize, mut interp_override: Option<InterpOverride>) -> Result<FexecResult>
			
 
				 where
			
 
				     A: IntoIterator,
			
 
				     E: IntoIterator,
			
 
				     A::Item: AsRef<[u8]>,
			
 
				     E::Item: AsRef<[u8]>,
			
 
				 {
			
 
				-    use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PF_W, PF_X}};
			
 
				+    use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PT_INTERP, PF_W, PF_X}};
			
 
				 
			
 
				     // Here, we do the minimum part of loading an application, which is what the kernel used to do.
			
 
				     // We load the executable into memory (albeit at different offsets in this executable), fix
			
@@ -49,20 +61,24 @@ where
 
				     const MAX_PH_SIZE: usize = 1024 * 1024;
			
 
				     let phentsize = u64::from(header.e_phentsize) as usize;
			
 
				     let phnum = u64::from(header.e_phnum) as usize;
			
 
				-    let pheaders_size = phentsize.saturating_mul(phnum);
			
 
				+    let pheaders_size = phentsize.saturating_mul(phnum).saturating_add(size_of::<Header>());
			
 
				 
			
 
				     if pheaders_size > MAX_PH_SIZE {
			
 
				         return Err(Error::new(E2BIG));
			
 
				     }
			
 
				-    let mut phs = vec! [0_u8; pheaders_size];
			
 
				+    let mut phs_raw = vec! [0_u8; pheaders_size];
			
 
				+    phs_raw[..size_of::<Header>()].copy_from_slice(&header_bytes);
			
 
				+    let phs = &mut phs_raw[size_of::<Header>()..];
			
 
				 
			
 
				-    let mut tree = BTreeMap::new();
			
 
				-    tree.insert(0, PAGE_SIZE);
			
 
				+    // TODO: Remove clone, but this would require more as_refs and as_muts
			
 
				+    let mut tree = interp_override.as_mut().map_or_else(|| {
			
 
				+        core::iter::once((0, PAGE_SIZE)).collect::<BTreeMap<_, _>>()
			
 
				+    }, |o| core::mem::take(&mut o.tree));
			
 
				 
			
 
				-    const BUFSZ: usize = 65536;
			
 
				+    const BUFSZ: usize = 1024 * 256;
			
 
				     let mut buf = vec! [0_u8; BUFSZ];
			
 
				 
			
 
				-    read_all(*image_file as usize, Some(header.e_phoff), &mut phs).map_err(|_| Error::new(EIO))?;
			
 
				+    read_all(*image_file as usize, Some(header.e_phoff), phs).map_err(|_| Error::new(EIO))?;
			
 
				 
			
 
				     for ph_idx in 0..phnum {
			
 
				         let ph_bytes = &phs[ph_idx * phentsize..(ph_idx + 1) * phentsize];
			
@@ -84,19 +100,42 @@ where
 
				         if segment.p_filesz > segment.p_memsz {
			
 
				             return Err(Error::new(ENOEXEC));
			
 
				         }
			
 
				-        if segment.p_type == PT_LOAD {
			
 
				-            mprotect_remote(*grants_fd, vaddr, size, flags)?;
			
 
				-            syscall::lseek(*image_file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
			
 
				-            syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
			
 
				-
			
 
				-            for size in core::iter::repeat(buf.len()).take((segment.p_filesz as usize) / buf.len()).chain(Some((segment.p_filesz as usize) % buf.len())) {
			
 
				-                read_all(*image_file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?;
			
 
				-                let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?;
			
 
				+        #[forbid(unreachable_patterns)]
			
 
				+        match segment.p_type {
			
 
				+            // PT_INTERP must come before any PT_LOAD, so we don't have to iterate twice.
			
 
				+            PT_INTERP => {
			
 
				+                let mut interp = vec! [0_u8; segment.p_filesz as usize];
			
 
				+                read_all(*image_file as usize, Some(segment.p_offset), &mut interp)?;
			
 
				+
			
 
				+                return Ok(FexecResult::Interp {
			
 
				+                    path: interp.into_boxed_slice(),
			
 
				+                    image_file,
			
 
				+                    open_via_dup,
			
 
				+                    interp_override: InterpOverride {
			
 
				+                        at_entry: header.e_entry as usize,
			
 
				+                        at_phnum: phnum,
			
 
				+                        at_phent: phentsize,
			
 
				+                        phs: phs_raw.into_boxed_slice(),
			
 
				+                        name: path.into(),
			
 
				+                        tree,
			
 
				+                    }
			
 
				+                });
			
 
				             }
			
 
				-
			
 
				-            if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() {
			
 
				-                tree.insert(vaddr, size);
			
 
				+            PT_LOAD => {
			
 
				+                mprotect_remote(*grants_fd, vaddr, size, flags)?;
			
 
				+                syscall::lseek(*image_file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
			
 
				+                syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
			
 
				+
			
 
				+                for size in core::iter::repeat(buf.len()).take((segment.p_filesz as usize) / buf.len()).chain(Some((segment.p_filesz as usize) % buf.len())) {
			
 
				+                    read_all(*image_file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?;
			
 
				+                    let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?;
			
 
				+                }
			
 
				+
			
 
				+                if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() {
			
 
				+                    tree.insert(vaddr, size);
			
 
				+                }
			
 
				             }
			
 
				+            _ => continue,
			
 
				         }
			
 
				     }
			
 
				     // Setup a stack starting from the very end of the address space, and then growing downwards.
			
@@ -113,22 +152,31 @@ where
 
				         write_all(*memory_fd, Some(sp as u64), &usize::to_ne_bytes(word))
			
 
				     };
			
 
				 
			
 
				-    let pheaders_size_aligned = (pheaders_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE;
			
 
				+    let pheaders_to_convey = if let Some(ref r#override) = interp_override {
			
 
				+        &*r#override.phs
			
 
				+    } else {
			
 
				+        &*phs_raw
			
 
				+    };
			
 
				+    let pheaders_size_aligned = (pheaders_to_convey.len()+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE;
			
 
				     let pheaders = find_free_target_addr(&tree, pheaders_size_aligned).ok_or(Error::new(ENOMEM))?;
			
 
				     tree.insert(pheaders, pheaders_size_aligned);
			
 
				     mprotect_remote(*grants_fd, pheaders, pheaders_size_aligned, MapFlags::PROT_READ)?;
			
 
				 
			
 
				-    write_all(*memory_fd, Some(pheaders as u64), &phs)?;
			
 
				+    write_all(*memory_fd, Some(pheaders as u64), &pheaders_to_convey)?;
			
 
				 
			
 
				     push(0)?;
			
 
				     push(AT_NULL)?;
			
 
				     push(header.e_entry as usize)?;
			
 
				+    if let Some(ref r#override) = interp_override {
			
 
				+        push(AT_BASE)?;
			
 
				+        push(r#override.at_entry)?;
			
 
				+    }
			
 
				     push(AT_ENTRY)?;
			
 
				-    push(pheaders)?;
			
 
				+    push(pheaders + size_of::<Header>())?;
			
 
				     push(AT_PHDR)?;
			
 
				-    push(header.e_phnum as usize)?;
			
 
				+    push(interp_override.as_ref().map_or(header.e_phnum as usize, |o| o.at_phnum))?;
			
 
				     push(AT_PHNUM)?;
			
 
				-    push(header.e_phentsize as usize)?;
			
 
				+    push(interp_override.as_ref().map_or(header.e_phentsize as usize, |o| o.at_phent))?;
			
 
				     push(AT_PHENT)?;
			
 
				 
			
 
				     let args_envs_size_aligned = (total_args_envs_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE;
			
@@ -176,14 +224,18 @@ where
 
				 
			
 
				     // TODO: Restore old name if exec failed?
			
 
				     if let Ok(name_fd) = syscall::dup(*open_via_dup, b"name").map(FdGuard::new) {
			
 
				-        let _ = syscall::write(*name_fd, path);
			
 
				+        let _ = syscall::write(*name_fd, interp_override.as_ref().map_or(path, |o| &o.name));
			
 
				+    }
			
 
				+    {
			
 
				+        let mmap_min_fd = FdGuard::new(syscall::dup(*open_via_dup, b"mmap-min-addr")?);
			
 
				+        let _ = syscall::write(*mmap_min_fd, &usize::to_ne_bytes(tree.iter().rev().nth(1).map_or(0, |(off, len)| *off + *len)));
			
 
				     }
			
 
				 
			
 
				     let addrspace_selection_fd = FdGuard::new(syscall::dup(*open_via_dup, b"current-addrspace")?);
			
 
				 
			
 
				     let _ = syscall::write(*addrspace_selection_fd, &create_set_addr_space_buf(*grants_fd, header.e_entry as usize, sp));
			
 
				 
			
 
				-    Ok(addrspace_selection_fd)
			
 
				+    Ok(FexecResult::Normal { addrspace_handle: addrspace_selection_fd })
			
 
				 }
			
 
				 fn mprotect_remote(socket: usize, addr: usize, len: usize, flags: MapFlags) -> Result<()> {
			
 
				     let mut grants_buf = [0_u8; 24];
			
@@ -295,3 +347,8 @@ pub fn create_set_addr_space_buf(space: usize, ip: usize, sp: usize) -> [u8; siz
 
				     *chunks.next().unwrap() = usize::to_ne_bytes(ip);
			
 
				     buf
			
 
				 }
			
 
				+
			
 
				+#[path = "../../../auxv_defs.rs"]
			
 
				+pub mod auxv_defs;
			
 
				+
			
 
				+use auxv_defs::*;