
fix division on SPARC (#393)

Aaron Kutch · 4 years ago
commit c975b0e9fe
3 changed files with 190 additions and 27 deletions
  1. src/int/specialized_div_rem/delegate.rs (+130 -0)
  2. src/int/specialized_div_rem/mod.rs (+28 -21)
  3. src/int/udiv.rs (+32 -6)

+ 130 - 0
src/int/specialized_div_rem/delegate.rs

@@ -185,3 +185,133 @@ macro_rules! impl_delegate {
         }
     };
 }
+
+/// Returns `n / d` and sets `*rem = n % d`.
+///
+/// This specialization exists because:
+///  - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
+///    so we have to use an old-fashioned `&mut u128` argument to return the remainder.
+///  - 64-bit SPARC does not have `u64 * u64 => u128` widening multiplication, which makes the
+///    `delegate` algorithm the only reasonably fast way to perform `u128` division.
+#[doc(hidden)]
+pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
+    use super::*;
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let div_lo = div as u64;
+    let div_hi = (div >> 64) as u64;
+
+    match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+        (true, true, _) => zero_div_fn(),
+        (_, false, true) => {
+            *rem = duo;
+            return 0;
+        }
+        (false, true, true) => {
+            let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
+            *rem = tmp.1 as u128;
+            return tmp.0 as u128;
+        }
+        (false, true, false) => {
+            if duo_hi < div_lo {
+                let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
+                let shl = if norm_shift == 0 {
+                    64 - 1
+                } else {
+                    64 - norm_shift
+                };
+
+                let mut div: u128 = div << shl;
+                let mut pow_lo: u64 = 1 << shl;
+                let mut quo_lo: u64 = 0;
+                let mut duo = duo;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (quo_lo | tmp.0) as u128;
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            } else if duo_hi == div_lo {
+                let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
+                *rem = tmp.1 as u128;
+                return (1 << 64) | (tmp.0 as u128);
+            } else {
+                if (div_lo >> 32) == 0 {
+                    let div_0 = div_lo as u32 as u64;
+                    let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
+
+                    let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
+                    let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
+
+                    let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
+                    let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
+
+                    *rem = rem_1 as u128;
+                    return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
+                }
+
+                let duo_lo = duo as u64;
+                let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
+                let quo_hi = tmp.0;
+                let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
+                if duo < div {
+                    *rem = duo;
+                    return (quo_hi as u128) << 64;
+                }
+
+                let mut div: u128 = div << (64 - 1);
+                let mut pow_lo: u64 = 1 << (64 - 1);
+                let mut quo_lo: u64 = 0;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            }
+        }
+        (_, false, false) => {
+            if duo < div {
+                *rem = duo;
+                return 0;
+            }
+            let div_original = div;
+            let shl = u64_normalization_shift(duo_hi, div_hi, false);
+            let mut duo = duo;
+            let mut div: u128 = div << shl;
+            let mut pow_lo: u64 = 1 << shl;
+            let mut quo_lo: u64 = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as i128) {
+                    duo = sub;
+                    quo_lo |= pow_lo;
+                    if duo < div_original {
+                        *rem = duo;
+                        return quo_lo as u128;
+                    }
+                }
+                div >>= 1;
+                pow_lo >>= 1;
+            }
+        }
+    }
+}
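
All three long-division branches in `u128_divide_sparc` share the same restoring shift-subtract core: normalize the divisor up to the dividend's leading bit, then walk a quotient bit downward while conditionally subtracting. Below is a minimal standalone sketch of that core, not the crate's actual code: it uses a plain comparison where the branches above use a `wrapping_sub` sign test (valid only under their normalization invariants), and it omits the bailout into `u64_by_u64_div_rem` once the upper half of the dividend reaches zero, which is where most of the real speed comes from.

```rust
/// Minimal sketch of restoring shift-subtract division:
/// returns `(duo / div, duo % div)` for `div != 0`.
fn binary_long_div_rem(mut duo: u128, div: u128) -> (u128, u128) {
    assert!(div != 0);
    if duo < div {
        return (0, duo);
    }
    // Normalization: line up the most significant set bits of `div` and
    // `duo` (the code above computes this with `u64_normalization_shift`).
    let shl = div.leading_zeros() - duo.leading_zeros();
    let mut div = div << shl;
    let mut pow: u128 = 1 << shl;
    let mut quo: u128 = 0;
    loop {
        // Restoring step: commit the subtraction only if it does not underflow.
        if duo >= div {
            duo -= div;
            quo |= pow;
        }
        if pow == 1 {
            return (quo, duo);
        }
        div >>= 1;
        pow >>= 1;
    }
}
```

For any nonzero `d`, `binary_long_div_rem(n, d)` agrees with `(n / d, n % d)`; the production branches beat this loop by switching to hardware 64-bit division as soon as `duo_hi` becomes zero.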

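The `(div_lo >> 32) == 0` fast path uses a different trick: schoolbook division with 32-bit digits. Because each remainder is smaller than the 32-bit divisor, it can be packed into the high half of the next 64-bit chunk, so the whole 128-bit division decomposes into three 64-by-64 divisions. A standalone sketch under those assumptions (a hypothetical helper, with the `/` and `%` operators standing in for `u64_by_u64_div_rem`):

```rust
/// Sketch of the short-divisor path: divide a 128-bit dividend by a
/// 32-bit divisor via three chained 64-bit divisions.
fn u128_div_rem_by_u32(duo: u128, div: u32) -> (u128, u32) {
    assert!(div != 0);
    let div = div as u64;
    // Top 64 bits: this quotient digit may be a full 64 bits wide.
    let duo_hi = (duo >> 64) as u64;
    let (quo_2, rem_2) = (duo_hi / div, duo_hi % div);
    // Middle 32-bit digit, with the previous remainder (< 2^32) packed on top.
    let mid = ((duo >> 32) as u32 as u64) | (rem_2 << 32);
    let (quo_1, rem_1) = (mid / div, mid % div);
    // Low 32-bit digit, same packing; `quo_1` and `quo_0` fit in 32 bits
    // because each packed chunk is less than `div << 32`.
    let lo = (duo as u32 as u64) | (rem_1 << 32);
    let (quo_0, rem_0) = (lo / div, lo % div);
    (
        (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_2 as u128) << 64),
        rem_0 as u32,
    )
}
```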
+ 28 - 21
src/int/specialized_div_rem/mod.rs

@@ -46,6 +46,7 @@ mod binary_long;
 
 #[macro_use]
 mod delegate;
+pub use self::delegate::u128_divide_sparc;
 
 #[macro_use]
 mod trifecta;
@@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
     unsafe { core::hint::unreachable_unchecked() }
 }
 
-// The `B` extension on RISC-V determines if a CLZ assembly instruction exists
-#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
-const USE_LZ: bool = cfg!(target_feature = "b");
-
-#[cfg(target_arch = "arm")]
-const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") {
-    // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This
-    // is needed to successfully differentiate between targets like `thumbv8.base` and
-    // `thumbv8.main`.
-    cfg!(target_feature = "v6t2")
-} else {
-    // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported.
-    // Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to
-    // work.
-    cfg!(target_feature = "v5te")
+const USE_LZ: bool = {
+    if cfg!(target_arch = "arm") {
+        if cfg!(target_feature = "thumb-mode") {
+            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
+            // supported. This is needed to successfully differentiate between targets like
+            // `thumbv8m.base` and `thumbv8m.main`.
+            cfg!(target_feature = "v6t2")
+        } else {
+            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
+            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
+            // feature does not seem to work.
+            cfg!(target_feature = "v5te")
+        }
+    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
+        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
+        cfg!(target_feature = "vis3")
+    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists.
+        cfg!(target_feature = "b")
+    } else {
+        // All other common targets Rust supports should have CLZ instructions
+        true
+    }
 };
 
-// All other targets Rust supports have CLZ instructions
-#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))]
-const USE_LZ: bool = true;
-
 impl_normalization_shift!(
     u32_normalization_shift,
     USE_LZ,
@@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
 // faster if the target pointer width is at least 64.
 #[cfg(all(
+    not(any(target_pointer_width = "16", target_pointer_width = "32")),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    not(any(target_pointer_width = "16", target_pointer_width = "32"))
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_trifecta!(
     u128_div_rem,
@@ -131,8 +137,9 @@ impl_trifecta!(
 // If the pointer width is less than 64, then the target architecture almost certainly does not have
 // the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
 #[cfg(all(
+    any(target_pointer_width = "16", target_pointer_width = "32"),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    any(target_pointer_width = "16", target_pointer_width = "32")
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_delegate!(
     u128_div_rem,

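The consolidated `USE_LZ` constant above decides whether `impl_normalization_shift!` can lean on a hardware count-leading-zeros instruction or must fall back to a software loop. A rough sketch of the difference, using hypothetical standalone functions rather than the macro-generated code:

```rust
/// With CLZ available (`USE_LZ == true`), a normalization shift, i.e. how far
/// `div` must be shifted left so its top set bit lines up with `duo`'s, is
/// just a subtraction of two `leading_zeros` results.
/// Assumes `duo >= div` and `div != 0`.
fn normalization_shift_clz(duo: u64, div: u64) -> u32 {
    div.leading_zeros() - duo.leading_zeros()
}

/// Without CLZ (e.g. SPARC before the VIS 3 extension), counting leading
/// zeros lowers to something like this bisection, so the normalization shift
/// costs two such loops instead of two instructions.
fn soft_leading_zeros(mut x: u64) -> u32 {
    let mut lz = 0;
    for s in [32u32, 16, 8, 4, 2, 1] {
        // If the top `s` bits of the remaining window are clear, count them
        // and shift the window up.
        if (x >> (64 - s)) == 0 {
            lz += s;
            x <<= s;
        }
    }
    // The loop saturates at 63 for `x == 0`.
    if x == 0 { 64 } else { lz }
}

fn main() {
    for x in [0u64, 1, 3, 1 << 63, u64::MAX] {
        assert_eq!(soft_leading_zeros(x), x.leading_zeros());
    }
    assert_eq!(normalization_shift_clz(u64::MAX, 1), 63);
}
```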
+ 32 - 6
src/int/udiv.rs

@@ -1,3 +1,4 @@
+pub use int::specialized_div_rem::u128_divide_sparc;
 use int::specialized_div_rem::*;
 
 intrinsics! {
@@ -46,25 +47,50 @@ intrinsics! {
         quo_rem.0
     }
 
+    // Note: we use block `#[cfg]`s and not `if cfg!(...)`, because we need to entirely remove
+    // the definition of `u128_div_rem` for 32-bit SPARC to compile; see the `u128_divide_sparc` docs.
+
     #[win64_128bit_abi_hack]
     /// Returns `n / d`
     pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).0
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            u128_divide_sparc(n, d, &mut 0)
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n % d`
     pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).1
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).1
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut rem = 0;
+            u128_divide_sparc(n, d, &mut rem);
+            rem
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d` and sets `*rem = n % d`
     pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
-        let quo_rem = u128_div_rem(n, d);
-        if let Some(rem) = rem {
-            *rem = quo_rem.1;
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            let quo_rem = u128_div_rem(n, d);
+            if let Some(rem) = rem {
+                *rem = quo_rem.1;
+            }
+            quo_rem.0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut tmp = 0;
+            let quo = u128_divide_sparc(n, d, &mut tmp);
+            if let Some(rem) = rem {
+                *rem = tmp;
+            }
+            quo
         }
-        quo_rem.0
     }
 }
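
For context on the note above: `cfg!(...)` expands to a plain `true` or `false`, so both arms of an `if cfg!` must still type-check and reach codegen on every target, and any call to `u128_div_rem` would drag its `(u128, u128)` return type into the 32-bit SPARC build, which LLVM cannot lower. Block `#[cfg]`s strip the disabled block early, before type checking, so that never happens. A minimal illustration with a hypothetical stand-in function:

```rust
// `returns_pair` stands in for `u128_div_rem`; its `(u128, u128)` return
// type is what LLVM cannot compile for 32-bit SPARC.
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))]
fn returns_pair(n: u128, d: u128) -> (u128, u128) {
    (n / d, n % d)
}

fn quotient(n: u128, d: u128) -> u128 {
    // `if cfg!(target_arch = "sparc") { ... } else { returns_pair(n, d).0 }`
    // would NOT work: both arms are compiled on every target, so SPARC would
    // still instantiate `returns_pair` and the build would fail.
    //
    // The block `#[cfg]`s below remove the disabled block before type
    // checking, so SPARC never sees `returns_pair` at all.
    #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))]
    {
        returns_pair(n, d).0
    }
    #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))]
    {
        n / d // placeholder; the real code calls `u128_divide_sparc`
    }
}

fn main() {
    assert_eq!(quotient(100, 7), 14);
}
```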