
Use REP MOVSQ/STOSQ on x86_64 (#365)

* mem: Move mem* functions to separate directory

Signed-off-by: Joe Richey <joerichey@google.com>

* memcpy: Create separate memcpy.rs file

Signed-off-by: Joe Richey <joerichey@google.com>

* benches: Add benchmarks for mem* functions

This allows comparing the "normal" implementations to the
implementations provided by this crate.

Signed-off-by: Joe Richey <joerichey@google.com>

* mem: Add REP MOVSB/STOSB implementations

The assembly generated seems correct:
    https://rust.godbolt.org/z/GGnec8

Signed-off-by: Joe Richey <joerichey@google.com>

* mem: Add documentation for REP string instructions

Signed-off-by: Joe Richey <joerichey@google.com>

* Use quad-word rep string instructions

Signed-off-by: Joe Richey <joerichey@google.com>

* Prevent panic when compiled in debug mode

Signed-off-by: Joe Richey <joerichey@google.com>

* Add tests for mem* functions

Signed-off-by: Joe Richey <joerichey@google.com>

* Add build/test with the "asm" feature

Signed-off-by: Joe Richey <joerichey@google.com>

* Add byte length to Bencher

Signed-off-by: Joe Richey <joerichey@google.com>
Joseph Richey, 4 years ago
parent commit 33ad3669db

+ 4 - 0
ci/run.sh

@@ -12,12 +12,16 @@ else
     $run --release
     $run --features c
     $run --features c --release
+    $run --features asm
+    $run --features asm --release
 fi
 
 cargo build --target $1
 cargo build --target $1 --release
 cargo build --target $1 --features c
 cargo build --target $1 --release --features c
+cargo build --target $1 --features asm
+cargo build --target $1 --release --features asm
 
 PREFIX=$(echo $1 | sed -e 's/unknown-//')-
 case $1 in

+ 41 - 0
src/mem/memcpy.rs

@@ -0,0 +1,41 @@
+use super::c_int;
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    let mut i = 0;
+    while i < n {
+        *dest.offset(i as isize) = *src.offset(i as isize);
+        i += 1;
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    if src < dest as *const u8 {
+        // copy from end
+        let mut i = n;
+        while i != 0 {
+            i -= 1;
+            *dest.offset(i as isize) = *src.offset(i as isize);
+        }
+    } else {
+        // copy from beginning
+        let mut i = 0;
+        while i < n {
+            *dest.offset(i as isize) = *src.offset(i as isize);
+            i += 1;
+        }
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
+    let mut i = 0;
+    while i < n {
+        *s.offset(i as isize) = c as u8;
+        i += 1;
+    }
+    s
+}

+ 4 - 39
src/mem.rs → src/mem/mod.rs

@@ -9,45 +9,10 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
 use core::mem;
 use core::ops::{BitOr, Shl};
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    if src < dest as *const u8 {
-        // copy from end
-        let mut i = n;
-        while i != 0 {
-            i -= 1;
-            *dest.offset(i as isize) = *src.offset(i as isize);
-        }
-    } else {
-        // copy from beginning
-        let mut i = 0;
-        while i < n {
-            *dest.offset(i as isize) = *src.offset(i as isize);
-            i += 1;
-        }
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *s.offset(i as isize) = c as u8;
-        i += 1;
-    }
-    s
-}
+// memcpy/memmove/memset have optimized implementations on some architectures
+#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
+mod memcpy;
+pub use self::memcpy::*;
 
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
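The swap itself happens through the `#[path]` attribute on `mod memcpy;`: by default the module body comes from `memcpy.rs` next to `mod.rs`, but with the `asm` feature on an x86_64 target it is loaded from `x86_64.rs` instead, and `pub use self::memcpy::*;` re-exports whichever set of symbols was compiled. A self-contained sketch of the same compile-time backend selection, using inline `cfg` modules and a hypothetical `fast` feature so it fits in one file (the crate itself uses `#[path]`, not inline modules):

    // Sketch only: pick one backend at compile time; callers see a single `copy`.
    #[cfg(all(feature = "fast", target_arch = "x86_64"))]
    mod backend {
        // stand-in for the asm-accelerated implementation
        pub fn copy(dst: &mut [u8], src: &[u8]) {
            dst.copy_from_slice(src);
        }
    }

    #[cfg(not(all(feature = "fast", target_arch = "x86_64")))]
    mod backend {
        // stand-in for the portable byte-by-byte fallback
        pub fn copy(dst: &mut [u8], src: &[u8]) {
            for (d, s) in dst.iter_mut().zip(src) {
                *d = *s;
            }
        }
    }

    // Re-export so callers are unaware of which backend was compiled in.
    pub use backend::copy;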

+ 79 - 0
src/mem/x86_64.rs

@@ -0,0 +1,79 @@
+use super::c_int;
+
+// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
+// been enhanced to perform better than an simple qword loop, making them ideal
+// for implementing memcpy/memset. Note that "rep cmps" has received no such
+// enhancement, so it is not used to implement memcmp.
+//
+// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
+// further enhanced to automatically select the best microarchitectural
+// implementation based on length and alignment. See the following features from
+// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
+//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
+//  - FSRM - Fast Short REP MOV (Ice Lake and later)
+//  - Fast Zero-Length MOVSB (On no current hardware)
+//  - Fast Short STOSB (On no current hardware)
+// However, to avoid run-time feature detection, we don't use these byte-based
+// instructions for most of the copying, preferring the qword variants.
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "rep movsq [rdi], [rsi]",
+        "mov ecx, {byte_count:e}",
+        "rep movsb [rdi], [rsi]",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(nostack, preserves_flags)
+    );
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+    let delta = (dest as usize).wrapping_sub(src as usize);
+    if delta >= count {
+        // We can copy forwards because either dest is far enough ahead of src,
+        // or src is ahead of dest (and delta overflowed).
+        return self::memcpy(dest, src, count);
+    }
+    // copy backwards
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "std",
+        "rep movsq [rdi], [rsi]",
+        "mov ecx, {byte_count:e}",
+        "add rdi, 7",
+        "add rsi, 7",
+        "rep movsb [rdi], [rsi]",
+        "cld",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _,
+        inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
+        options(nostack)
+    );
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "rep stosq [rdi], rax",
+        "mov ecx, {byte_count:e}",
+        "rep stosb [rdi], al",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => _,
+        in("rax") (c as u8 as u64) * 0x0101010101010101,
+        options(nostack, preserves_flags)
+    );
+    dest
+}
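Two one-liners above carry most of the subtlety: the memmove overlap test `(dest as usize).wrapping_sub(src as usize) >= count`, and the memset splat `(c as u8 as u64) * 0x0101010101010101` that fills every byte of the qword written by each `rep stosq` step. A minimal stand-alone sketch of just that arithmetic, with hypothetical helper names and no inline asm:

    fn copy_direction_is_forward(dest: usize, src: usize, count: usize) -> bool {
        // If dest is at least `count` bytes past src, a forward copy cannot
        // clobber source bytes before they are read. If src is ahead of dest,
        // the wrapping subtraction wraps to a huge value (also >= count), so a
        // forward copy is chosen there too.
        dest.wrapping_sub(src) >= count
    }

    fn splat_byte(c: u8) -> u64 {
        // Multiplying by 0x0101010101010101 copies the byte into all eight
        // lanes of the quad word that `rep stosq` stores per iteration.
        (c as u64) * 0x0101_0101_0101_0101
    }

    fn main() {
        // src ahead of dest: forward copy is safe even if the regions overlap.
        assert!(copy_direction_is_forward(100, 200, 64));
        // dest overlaps the tail of src: must copy backwards.
        assert!(!copy_direction_is_forward(132, 100, 64));
        // disjoint regions with dest above src: forward copy is fine.
        assert!(copy_direction_is_forward(200, 100, 64));

        assert_eq!(splat_byte(0x09), 0x0909_0909_0909_0909);
    }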

+ 162 - 0
testcrate/benches/mem.rs

@@ -0,0 +1,162 @@
+#![feature(test)]
+
+extern crate test;
+use test::{black_box, Bencher};
+
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+fn memcpy_builtin(b: &mut Bencher, n: usize) {
+    let v1 = vec![1u8; n];
+    let mut v2 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1);
+        let dst: &mut [u8] = black_box(&mut v2);
+        dst.copy_from_slice(src);
+    })
+}
+
+fn memcpy_rust(b: &mut Bencher, n: usize) {
+    let v1 = vec![1u8; n];
+    let mut v2 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1);
+        let dst: &mut [u8] = black_box(&mut v2);
+        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
+    })
+}
+
+fn memset_builtin(b: &mut Bencher, n: usize) {
+    let mut v1 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1);
+        let val: u8 = black_box(27);
+        for b in dst {
+            *b = val;
+        }
+    })
+}
+
+fn memset_rust(b: &mut Bencher, n: usize) {
+    let mut v1 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1);
+        let val = black_box(27);
+        unsafe { memset(dst.as_mut_ptr(), val, n) }
+    })
+}
+
+fn memcmp_builtin(b: &mut Bencher, n: usize) {
+    let v1 = vec![0u8; n];
+    let mut v2 = vec![0u8; n];
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_rust(b: &mut Bencher, n: usize) {
+    let v1 = vec![0u8; n];
+    let mut v2 = vec![0u8; n];
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
+    })
+}
+
+fn memmove_builtin(b: &mut Bencher, n: usize) {
+    let mut v = vec![0u8; n + n / 2];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s: &mut [u8] = black_box(&mut v);
+        s.copy_within(0..n, n / 2);
+    })
+}
+
+fn memmove_rust(b: &mut Bencher, n: usize) {
+    let mut v = vec![0u8; n + n / 2];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr();
+        let src: *const u8 = black_box(&v).as_ptr();
+        unsafe { memmove(dst, src, n) };
+    })
+}
+
+#[bench]
+fn memcpy_builtin_4096(b: &mut Bencher) {
+    memcpy_builtin(b, 4096)
+}
+#[bench]
+fn memcpy_rust_4096(b: &mut Bencher) {
+    memcpy_rust(b, 4096)
+}
+#[bench]
+fn memcpy_builtin_1048576(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576)
+}
+#[bench]
+fn memcpy_rust_1048576(b: &mut Bencher) {
+    memcpy_rust(b, 1048576)
+}
+
+#[bench]
+fn memset_builtin_4096(b: &mut Bencher) {
+    memset_builtin(b, 4096)
+}
+#[bench]
+fn memset_rust_4096(b: &mut Bencher) {
+    memset_rust(b, 4096)
+}
+#[bench]
+fn memset_builtin_1048576(b: &mut Bencher) {
+    memset_builtin(b, 1048576)
+}
+#[bench]
+fn memset_rust_1048576(b: &mut Bencher) {
+    memset_rust(b, 1048576)
+}
+
+#[bench]
+fn memcmp_builtin_4096(b: &mut Bencher) {
+    memcmp_builtin(b, 4096)
+}
+#[bench]
+fn memcmp_rust_4096(b: &mut Bencher) {
+    memcmp_rust(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_1048576(b: &mut Bencher) {
+    memcmp_builtin(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_1048576(b: &mut Bencher) {
+    memcmp_rust(b, 1048576)
+}
+
+#[bench]
+fn memmove_builtin_4096(b: &mut Bencher) {
+    memmove_builtin(b, 4096)
+}
+#[bench]
+fn memmove_rust_4096(b: &mut Bencher) {
+    memmove_rust(b, 4096)
+}
+#[bench]
+fn memmove_builtin_1048576(b: &mut Bencher) {
+    memmove_builtin(b, 1048576)
+}
+#[bench]
+fn memmove_rust_1048576(b: &mut Bencher) {
+    memmove_rust(b, 1048576)
+}
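Setting `b.bytes` to the buffer length is what makes `cargo bench` report throughput for each case rather than only ns/iter, so the builtin and crate variants can be compared directly across sizes. If short copies mattered, extra cases in the same style might look like this (hypothetical addition reusing the helpers above, not part of this commit):

    #[bench]
    fn memcpy_builtin_16(b: &mut Bencher) {
        memcpy_builtin(b, 16)
    }
    #[bench]
    fn memcpy_rust_16(b: &mut Bencher) {
        memcpy_rust(b, 16)
    }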

+ 133 - 0
testcrate/tests/mem.rs

@@ -0,0 +1,133 @@
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+#[test]
+fn memcpy_3() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(9);
+        let dst = arr.as_mut_ptr().offset(1);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11]);
+    }
+    arr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        let dst = arr.as_mut_ptr().offset(9);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3]);
+    }
+}
+
+#[test]
+fn memcpy_10() {
+    let arr: [u8; 18] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+    let mut dst: [u8; 12] = [0; 12];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0]);
+    }
+    unsafe {
+        let src = arr.as_ptr().offset(8);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0]);
+    }
+}
+
+#[test]
+fn memcpy_big() {
+    // Make the arrays cross 3 pages
+    const SIZE: usize = 8193;
+    let src: [u8; SIZE] = [22; SIZE];
+    struct Dst {
+        start: usize,
+        buf: [u8; SIZE],
+        end: usize,
+    }
+
+    let mut dst = Dst {
+        start: 0,
+        buf: [0; SIZE],
+        end: 0,
+    };
+    unsafe {
+        assert_eq!(
+            memcpy(dst.buf.as_mut_ptr(), src.as_ptr(), SIZE),
+            dst.buf.as_mut_ptr()
+        );
+        assert_eq!(dst.start, 0);
+        assert_eq!(dst.buf, [22; SIZE]);
+        assert_eq!(dst.end, 0);
+    }
+}
+
+#[test]
+fn memmove_forward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(6);
+        let dst = arr.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 6, 7, 8, 9, 10, 8, 9, 10, 11]);
+    }
+}
+
+#[test]
+fn memmove_backward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(3);
+        let dst = arr.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 3, 4, 5, 6, 7, 11]);
+    }
+}
+
+#[test]
+fn memset_zero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(5);
+        assert_eq!(memset(ptr, 0, 2), ptr);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 0, 0, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2000, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [0, 0, 0, 0, 0, 0, 0, 0]);
+    }
+}
+
+#[test]
+fn memset_nonzero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(2);
+        assert_eq!(memset(ptr, 22, 3), ptr);
+        assert_eq!(arr, [0, 1, 22, 22, 22, 5, 6, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2009, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [9, 9, 9, 9, 9, 9, 9, 9]);
+    }
+}
+
+#[test]
+fn memcmp_eq() {
+    let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0);
+        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 3), 0);
+    }
+}
+
+#[test]
+fn memcmp_ne() {
+    let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 7, 7];
+    unsafe {
+        assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8) < 0);
+        assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 8) > 0);
+    }
+}
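One edge case worth noting: with `count == 0` both the qword and byte counts are zero, so each `rep` executes zero iterations and the call must be a no-op that still returns `dest`. A hypothetical extra test in the same style (not part of this commit):

    #[test]
    fn memcpy_zero_len() {
        let src: [u8; 4] = [1, 2, 3, 4];
        let mut dst: [u8; 4] = [9, 9, 9, 9];
        unsafe {
            // Copying zero bytes must return `dest` and leave it untouched.
            assert_eq!(memcpy(dst.as_mut_ptr(), src.as_ptr(), 0), dst.as_mut_ptr());
        }
        assert_eq!(dst, [9, 9, 9, 9]);
    }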