
Use REP MOVSB/STOSB when the ERMSB feature is present (#392)

* Reorganize mem functions

This reduces the amount of platform-specific code

Signed-off-by: Joe Richey <joerichey@google.com>

* Use ERMSB implementations if the feature is set

Signed-off-by: Joe Richey <joerichey@google.com>

* Add non-aligned benchmarks

Signed-off-by: Joe Richey <joerichey@google.com>
Joseph Richey, 4 years ago
parent
commit 63c0091a61
5 changed files with 148 additions and 88 deletions
  1. src/mem/impls.rs            +29   -0
  2. src/mem/memcpy.rs            +0  -41
  3. src/mem/mod.rs              +26   -2
  4. src/mem/x86_64.rs           +37  -21
  5. testcrate/benches/mem.rs    +56  -24

src/mem/impls.rs   +29  -0

@@ -0,0 +1,29 @@
+use super::c_int;
+
+#[inline(always)]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
+    let mut i = 0;
+    while i < n {
+        *dest.offset(i as isize) = *src.offset(i as isize);
+        i += 1;
+    }
+}
+
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
+    // copy from end
+    let mut i = n;
+    while i != 0 {
+        i -= 1;
+        *dest.offset(i as isize) = *src.offset(i as isize);
+    }
+}
+
+#[inline(always)]
+pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
+    let mut i = 0;
+    while i < n {
+        *s.offset(i as isize) = c;
+        i += 1;
+    }
+}
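
These portable byte loops also pin down the interface the architecture-specific file has to match: raw dest/src pointers plus a byte count, and no return value (returning `dest`, as `memcpy` must, is left to the wrappers in mod.rs below). As a quick illustration, here is a minimal stand-alone harness, my own sketch rather than anything in the crate, that restates two of the loops and runs them on heap buffers:

    unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
        for i in 0..n {
            *dest.add(i) = *src.add(i);
        }
    }

    unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
        for i in 0..n {
            *s.add(i) = c;
        }
    }

    fn main() {
        let src = vec![7u8; 32];
        let mut dst = vec![0u8; 32];
        let n = dst.len();
        // Same calls mod.rs makes, minus the extern "C" wrapper and return value.
        unsafe { copy_forward(dst.as_mut_ptr(), src.as_ptr(), n) };
        assert_eq!(dst, src);
        unsafe { set_bytes(dst.as_mut_ptr(), 0xAA, n) };
        assert!(dst.iter().all(|&b| b == 0xAA));
    }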

src/mem/memcpy.rs   +0  -41

@@ -1,41 +0,0 @@
-use super::c_int;
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    if src < dest as *const u8 {
-        // copy from end
-        let mut i = n;
-        while i != 0 {
-            i -= 1;
-            *dest.offset(i as isize) = *src.offset(i as isize);
-        }
-    } else {
-        // copy from beginning
-        let mut i = 0;
-        while i < n {
-            *dest.offset(i as isize) = *src.offset(i as isize);
-            i += 1;
-        }
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *s.offset(i as isize) = c as u8;
-        i += 1;
-    }
-    s
-}

src/mem/mod.rs   +26  -2

@@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};
 
 // memcpy/memmove/memset have optimized implementations on some architectures
 #[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
-mod memcpy;
-pub use self::memcpy::*;
+mod impls;
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    impls::copy_forward(dest, src, n);
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    let delta = (dest as usize).wrapping_sub(src as usize);
+    if delta >= n {
+        // We can copy forwards because either dest is far enough ahead of src,
+        // or src is ahead of dest (and delta overflowed).
+        impls::copy_forward(dest, src, n);
+    } else {
+        impls::copy_backward(dest, src, n);
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
+    impls::set_bytes(s, c as u8, n);
+    s
+}
 
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
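
The `delta >= n` test above replaces the old `src < dest` comparison from memcpy.rs and decides the copy direction with a single wrapping subtraction. A minimal sketch of the reasoning, my own illustration using plain usize addresses instead of the crate's pointers:

    // delta = dest - src, modulo 2^64. A backward copy is only needed when
    // dest starts inside the source range, i.e. when 0 <= dest - src < n.
    fn needs_backward_copy(dest: usize, src: usize, n: usize) -> bool {
        dest.wrapping_sub(src) < n
    }

    fn main() {
        // dest a little ahead of src: the ranges overlap, copy backward.
        assert!(needs_backward_copy(0x1004, 0x1000, 16));
        // dest far ahead of src: forward copy is safe.
        assert!(!needs_backward_copy(0x2000, 0x1000, 16));
        // dest behind src: the subtraction wraps to a huge value, so the test
        // fails and a forward copy is chosen, which is safe even though the
        // ranges overlap.
        assert!(!needs_backward_copy(0x1000, 0x1004, 16));
    }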

src/mem/x86_64.rs   +37  -21

@@ -1,5 +1,3 @@
-use super::c_int;
-
 // On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
 // been enhanced to perform better than a simple qword loop, making them ideal
 // for implementing memcpy/memset. Note that "rep cmps" has received no such
@@ -13,11 +11,26 @@ use super::c_int;
 //  - FSRM - Fast Short REP MOV (Ice Lake and later)
 //  - Fast Zero-Length MOVSB (On no current hardware)
 //  - Fast Short STOSB (On no current hardware)
-// However, to avoid run-time feature detection, we don't use these byte-based
-// instructions for most of the copying, preferring the qword variants.
+//
+// To simplify things, we switch to using the byte-based variants if the "ermsb"
+// feature is present at compile-time. We don't bother detecting other features.
+// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
+
+#[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
+    asm!(
+        "rep movsb [rdi], [rsi]",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(nostack, preserves_flags)
+    );
+}
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
         inout("rsi") src => _,
         options(nostack, preserves_flags)
     );
-    dest
 }
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
-    let delta = (dest as usize).wrapping_sub(src as usize);
-    if delta >= count {
-        // We can copy forwards because either dest is far enough ahead of src,
-        // or src is ahead of dest (and delta overflowed).
-        return self::memcpy(dest, src, count);
-    }
-    // copy backwards
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
         inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
         options(nostack)
     );
-    dest
 }
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
+#[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
+    asm!(
+        "rep stosb [rdi], al",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("al") c => _,
+        options(nostack, preserves_flags)
+    )
+}
+
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
+pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
         byte_count = in(reg) byte_count,
         inout("rcx") qword_count => _,
         inout("rdi") dest => _,
-        in("rax") (c as u8 as u64) * 0x0101010101010101,
+        in("rax") (c as u64) * 0x0101010101010101,
         options(nostack, preserves_flags)
     );
-    dest
 }
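
Two details above are worth spelling out. The ERMSB variants are selected purely at compile time through `cfg(target_feature = "ermsb")`, so the byte-based `rep movsb`/`rep stosb` paths only exist if the crate is built with that feature enabled (for example via `-C target-feature=+ermsb`, or `-C target-cpu=native` on an ERMSB-capable CPU); there is no run-time dispatch. And the qword `set_bytes` broadcasts the fill byte into all eight bytes of `rax` by multiplying it by 0x0101010101010101. A small stand-alone sketch, my own and not part of the crate, demonstrating both:

    fn main() {
        // Which pair of copy_forward/set_bytes variants this build would compile in.
        if cfg!(target_feature = "ermsb") {
            println!("byte-based rep movsb / rep stosb variants");
        } else {
            println!("qword rep movsq / rep stosq variants");
        }

        // The byte broadcast used by the qword set_bytes: every byte of the
        // product equals the original fill byte.
        let c: u8 = 0x2A;
        let fill = (c as u64) * 0x0101010101010101;
        assert_eq!(fill.to_le_bytes(), [c; 8]); // 0x2A2A2A2A2A2A2A2A
    }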

testcrate/benches/mem.rs   +56  -24

@@ -6,33 +6,33 @@ use test::{black_box, Bencher};
 extern crate compiler_builtins;
 use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
 
-fn memcpy_builtin(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
+fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let v1 = vec![1u8; n + offset];
+    let mut v2 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
+        let src: &[u8] = black_box(&v1[offset..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset..]);
         dst.copy_from_slice(src);
     })
 }
 
-fn memcpy_rust(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
+fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let v1 = vec![1u8; n + offset];
+    let mut v2 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
+        let src: &[u8] = black_box(&v1[offset..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset..]);
         unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
     })
 }
 
-fn memset_builtin(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
+fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
         let val: u8 = black_box(27);
         for b in dst {
             *b = val;
@@ -40,11 +40,11 @@ fn memset_builtin(b: &mut Bencher, n: usize) {
     })
 }
 
-fn memset_rust(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
+fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
         let val = black_box(27);
         unsafe { memset(dst.as_mut_ptr(), val, n) }
     })
@@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {
 
 #[bench]
 fn memcpy_builtin_4096(b: &mut Bencher) {
-    memcpy_builtin(b, 4096)
+    memcpy_builtin(b, 4096, 0)
 }
 #[bench]
 fn memcpy_rust_4096(b: &mut Bencher) {
-    memcpy_rust(b, 4096)
+    memcpy_rust(b, 4096, 0)
 }
 #[bench]
 fn memcpy_builtin_1048576(b: &mut Bencher) {
-    memcpy_builtin(b, 1048576)
+    memcpy_builtin(b, 1048576, 0)
 }
 #[bench]
 fn memcpy_rust_1048576(b: &mut Bencher) {
-    memcpy_rust(b, 1048576)
+    memcpy_rust(b, 1048576, 0)
+}
+#[bench]
+fn memcpy_builtin_4096_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65)
+}
+#[bench]
+fn memcpy_rust_4096_offset(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65)
+}
+#[bench]
+fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memcpy_rust_1048576_offset(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65)
 }
 
 #[bench]
 fn memset_builtin_4096(b: &mut Bencher) {
-    memset_builtin(b, 4096)
+    memset_builtin(b, 4096, 0)
 }
 #[bench]
 fn memset_rust_4096(b: &mut Bencher) {
-    memset_rust(b, 4096)
+    memset_rust(b, 4096, 0)
 }
 #[bench]
 fn memset_builtin_1048576(b: &mut Bencher) {
-    memset_builtin(b, 1048576)
+    memset_builtin(b, 1048576, 0)
 }
 #[bench]
 fn memset_rust_1048576(b: &mut Bencher) {
-    memset_rust(b, 1048576)
+    memset_rust(b, 1048576, 0)
+}
+#[bench]
+fn memset_builtin_4096_offset(b: &mut Bencher) {
+    memset_builtin(b, 4096, 65)
+}
+#[bench]
+fn memset_rust_4096_offset(b: &mut Bencher) {
+    memset_rust(b, 4096, 65)
+}
+#[bench]
+fn memset_builtin_1048576_offset(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memset_rust_1048576_offset(b: &mut Bencher) {
+    memset_rust(b, 1048576, 65)
 }
 
 #[bench]