Merge pull request #332 from AaronKutch/issue-265

Amanieu d'Antras 4 years ago
parent commit 1220e671e1

+ 4 - 0
Cargo.toml

@@ -40,6 +40,10 @@ panic-handler = { path = 'crates/panic-handler' }
 [features]
 default = ["compiler-builtins"]
 
+# Some algorithms benefit from inline assembly, but some compiler backends do
+# not support it, so inline assembly is only enabled when this flag is set.
+asm = []
+
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
 c = ["cc"]
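
Nothing in this hunk consumes the new `asm` flag yet; it is a plain Cargo feature gate. As a rough sketch of how such a flag is typically consumed on the Rust side (illustrative only, with a made-up function name; this is not code from the patch):

    // Hypothetical illustration of gating an inline-assembly fast path behind a
    // Cargo feature like the `asm` feature added above.
    #[cfg(feature = "asm")]
    fn fast_path(x: u64) -> u32 {
        // an inline-assembly implementation would live here on supported backends
        x.leading_zeros()
    }

    #[cfg(not(feature = "asm"))]
    fn fast_path(x: u64) -> u32 {
        // portable fallback for compiler backends without inline-assembly support
        x.leading_zeros()
    }

    fn main() {
        // with the feature off, this uses the portable fallback
        assert_eq!(fast_path(1u64 << 40), 23);
    }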

+ 1 - 11
src/int/mod.rs

@@ -1,16 +1,6 @@
 use core::ops;
 
-macro_rules! hty {
-    ($ty:ty) => {
-        <$ty as LargeInt>::HighHalf
-    };
-}
-
-macro_rules! os_ty {
-    ($ty:ty) => {
-        <$ty as Int>::OtherSign
-    };
-}
+mod specialized_div_rem;
 
 pub mod addsub;
 pub mod leading_zeros;

+ 39 - 75
src/int/sdiv.rs

@@ -1,101 +1,65 @@
-use int::Int;
-
-trait Div: Int {
-    /// Returns `a / b`
-    fn div(self, other: Self) -> Self {
-        let s_a = self >> (Self::BITS - 1);
-        let s_b = other >> (Self::BITS - 1);
-        // NOTE it's OK to overflow here because of the `.unsigned()` below.
-        // This whole operation is computing the absolute value of the inputs
-        // So some overflow will happen when dealing with e.g. `i64::MIN`
-        // where the absolute value is `(-i64::MIN) as u64`
-        let a = (self ^ s_a).wrapping_sub(s_a);
-        let b = (other ^ s_b).wrapping_sub(s_b);
-        let s = s_a ^ s_b;
-
-        let r = a.unsigned().aborting_div(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
-    }
-}
-
-impl Div for i32 {}
-impl Div for i64 {}
-impl Div for i128 {}
-
-trait Mod: Int {
-    /// Returns `a % b`
-    fn mod_(self, other: Self) -> Self {
-        let s = other >> (Self::BITS - 1);
-        // NOTE(wrapping_sub) see comment in the `div`
-        let b = (other ^ s).wrapping_sub(s);
-        let s = self >> (Self::BITS - 1);
-        let a = (self ^ s).wrapping_sub(s);
-
-        let r = a.unsigned().aborting_rem(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
-    }
-}
-
-impl Mod for i32 {}
-impl Mod for i64 {}
-impl Mod for i128 {}
-
-trait Divmod: Int {
-    /// Returns `a / b` and sets `*rem = n % d`
-    fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
-    where
-        F: Fn(Self, Self) -> Self,
-    {
-        let r = div(self, other);
-        // NOTE won't overflow because it's using the result from the
-        // previous division
-        *rem = self - r.wrapping_mul(other);
-        r
-    }
-}
-
-impl Divmod for i32 {}
-impl Divmod for i64 {}
+use int::specialized_div_rem::*;
 
 intrinsics! {
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_idiv]
+    /// Returns `n / d`
     pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
-        a.div(b)
+        i32_div_rem(a, b).0
     }
 
     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
-        a.div(b)
+    /// Returns `n % d`
+    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
+        i32_div_rem(a, b).1
     }
 
-    #[win64_128bit_abi_hack]
-    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
-        a.div(b)
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
+        let quo_rem = i32_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 
     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
-        a.mod_(b)
+    /// Returns `n / d`
+    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
+        i64_div_rem(a, b).0
     }
 
     #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
     pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
-        a.mod_(b)
+        i64_div_rem(a, b).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
+        let quo_rem = i64_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 
     #[win64_128bit_abi_hack]
-    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
-        a.mod_(b)
+    /// Returns `n / d`
+    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
+        i128_div_rem(a, b).0
     }
 
-    #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
-        a.divmod(b, rem, |a, b| __divsi3(a, b))
+    #[win64_128bit_abi_hack]
+    /// Returns `n % d`
+    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
+        i128_div_rem(a, b).1
     }
 
-    #[aapcs_on_arm]
-    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
-        a.divmod(b, rem, |a, b| __divdi3(a, b))
+    // LLVM does not currently have a `__divmodti4` function, but GCC does
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 {
+        let quo_rem = i128_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 }
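
The rewritten intrinsics all follow the same pattern: compute the quotient and remainder together once, then return one or both. A minimal sketch of that shape in plain Rust, using a hypothetical stand-in for the specialized division (illustrative only; the real `i64_div_rem` comes from `specialized_div_rem` below):

    // Hypothetical stand-in for the specialized division used above.
    fn i64_div_rem(a: i64, b: i64) -> (i64, i64) {
        (a.wrapping_div(b), a.wrapping_rem(b))
    }

    // Mirrors the shape of `__divmoddi4`: the quotient is returned and the
    // remainder is written through the `&mut` out-argument.
    fn divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
        let quo_rem = i64_div_rem(a, b);
        *rem = quo_rem.1;
        quo_rem.0
    }

    fn main() {
        let mut rem = 0;
        let quo = divmoddi4(-7, 3, &mut rem);
        // truncated division: the quotient rounds toward zero and the remainder
        // takes the dividend's sign
        assert_eq!((quo, rem), (-2, -1));
    }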

+ 169 - 0
src/int/specialized_div_rem/asymmetric.rs

@@ -0,0 +1,169 @@
+/// Creates unsigned and signed division functions optimized for dividing integers with the same
+/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
+/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
+/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
+/// construct a full 128 bit by 128 bit division.
+#[macro_export]
+macro_rules! impl_asymmetric {
+    (
+        $unsigned_name:ident, // name of the unsigned division function
+        $signed_name:ident, // name of the signed division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_division:ident, // function for division of a $uX by a $uX
+        $asymmetric_division:ident, // function for division of a $uD by a $uX
+        $n_h:expr, // the number of bits in a $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            let n: u32 = $n_h * 2;
+
+            // Many of these subalgorithms are taken from trifecta.rs, see that for better
+            // documentation.
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+            if div_hi == 0 {
+                if div_lo == 0 {
+                    $zero_div_fn()
+                }
+                if duo_hi < div_lo {
+                    // `$uD` by `$uX` division with a quotient that will fit into a `$uX`
+                    let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
+                    return (quo as $uD, rem as $uD)
+                } else if (div_lo >> $n_h) == 0 {
+                    // Short division of $uD by a $uH.
+
+                    // Some x86_64 CPUs have bad division implementations that make specializing
+                    // this case faster.
+                    let div_0 = div_lo as $uH as $uX;
+                    let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                    let duo_mid =
+                        ((duo >> $n_h) as $uH as $uX)
+                        | (rem_3 << $n_h);
+                    let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                    let duo_lo =
+                        (duo as $uH as $uX)
+                        | (rem_2 << $n_h);
+                    let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                    return (
+                        (quo_0 as $uD)
+                        | ((quo_1 as $uD) << $n_h)
+                        | ((quo_hi as $uD) << n),
+                        rem_1 as $uD
+                    )
+                } else {
+                    // Short division using the $uD by $uX division
+                    let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+                    let tmp = unsafe {
+                        $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+                    };
+                    return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
+                }
+            }
+
+            let duo_lz = duo_hi.leading_zeros();
+            let div_lz = div_hi.leading_zeros();
+            let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
+            if rel_leading_sb < $n_h {
+                // Some x86_64 CPUs have bad hardware division implementations that make putting
+                // a two possibility algorithm here beneficial. We also avoid a full `$uD`
+                // multiplication.
+                let shift = n - duo_lz;
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let quo = $half_division(duo_sig_n, div_sig_n).0;
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(quo, div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if (overflow != 0) || (duo < tmp) {
+                    return (
+                        (quo - 1) as $uD,
+                        duo.wrapping_add(div).wrapping_sub(tmp)
+                    )
+                } else {
+                    return (
+                        quo as $uD,
+                        duo - tmp
+                    )
+                }
+            } else {
+                // This has been adapted from
+                // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+                // adapted from Hacker's Delight. This is similar to the two possibility algorithm
+                // in that it uses only more significant parts of `duo` and `div` to divide a large
+                // integer with a smaller division instruction.
+
+                let div_extra = n - div_lz;
+                let div_sig_n = (div >> div_extra) as $uX;
+                let tmp = unsafe {
+                    $asymmetric_division(duo >> 1, div_sig_n)
+                };
+
+                let mut quo = tmp.0 >> ((n - 1) - div_lz);
+                if quo != 0 {
+                    quo -= 1;
+                }
+
+                // Note that this is a full `$uD` multiplication being used here
+                let mut rem = duo - (quo as $uD).wrapping_mul(div);
+                if div <= rem {
+                    quo += 1;
+                    rem -= div;
+                }
+                return (quo as $uD, rem)
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
+            match (duo < 0, div < 0) {
+                (false, false) => {
+                    let t = $unsigned_name(duo as $uD, div as $uD);
+                    (t.0 as $iD, t.1 as $iD)
+                },
+                (true, false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
+                    ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
+                },
+                (false, true) => {
+                    let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(), t.1 as $iD)
+                },
+                (true, true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
+                    (t.0 as $iD, (t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}
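
The `$asymmetric_division` parameter above is the one piece not defined in this file: it is expected to be an `unsafe fn` that divides a double-width `$uD` by a single-width `$uX` when the quotient is known to fit. As a hedged sketch of what such a function could look like on x86_64 using the `div` instruction (illustrative only; the name, operand choices, and safety wording are assumptions, not the code added by this PR):

    /// Divides a 128-bit dividend by a 64-bit divisor with the x86_64 `div`
    /// instruction. UB if `div == 0` or if the quotient does not fit in 64 bits.
    #[cfg(target_arch = "x86_64")]
    unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
        let duo_lo = duo as u64;
        let duo_hi = (duo >> 64) as u64;
        let quo: u64;
        let rem: u64;
        // `div r64` divides RDX:RAX by the operand; quotient -> RAX, remainder -> RDX.
        core::arch::asm!(
            "div {divisor}",
            divisor = in(reg) div,
            inout("rax") duo_lo => quo,
            inout("rdx") duo_hi => rem,
            options(nomem, nostack)
        );
        (quo, rem)
    }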

+ 596 - 0
src/int/specialized_div_rem/binary_long.rs

@@ -0,0 +1,596 @@
+/// Creates unsigned and signed division functions that use binary long division, designed for
+/// computer architectures without division instructions. These functions have good performance for
+/// microarchitectures with large branch miss penalties and architectures without the ability to
+/// predicate instructions. For architectures with predicated instructions, one of the algorithms
+/// described in the documentation of these functions probably has higher performance, and a custom
+/// assembly routine should be used instead.
+#[macro_export]
+macro_rules! impl_binary_long {
+    (
+        $unsigned_name:ident, // name of the unsigned division function
+        $signed_name:ident, // name of the signed division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $normalization_shift:ident, // function for finding the normalization shift
+        $n:tt, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) {
+            let mut duo = duo;
+            // handle edge cases before calling `$normalization_shift`
+            if div == 0 {
+                $zero_div_fn()
+            }
+            if duo < div {
+                return (0, duo)
+            }
+
+            // There are many variations of binary division algorithm that could be used. This
+            // documentation gives a tour of different methods so that future readers wanting to
+            // optimize further do not have to painstakingly derive them. The SWAR variation is
+            // especially hard to understand without reading the less convoluted methods first.
+
+            // You may notice that a `duo < div_original` check is included in many of these
+            // algorithms. A critical optimization that many algorithms miss is handling of
+            // quotients that will turn out to have many trailing zeros or many leading zeros. This
+            // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
+            // in cases where the quotient is small. The `duo < div_original` check handles these
+            // cases of early returns and ends up replacing other kinds of mundane checks that
+            // normally terminate a binary division algorithm.
+            //
+            // Something you may see in other algorithms that is not special-cased here is checks
+            // for division by powers of two. The `duo < div_original` check handles this case and
+            // more, however it can be checked up front before the bisection using the
+            // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
+            // compilers should handle most cases where divisions by power of two occur, and we do
+            // not want to add on a few cycles for every division operation just to save a few
+            // cycles rarely.
+
+            // The following example is the most straightforward translation from the way binary
+            // long division is typically visualized:
+            // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
+            // the result from `$normalization_shift(duo, div, false)`.
+            //
+            // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
+            // and `duo` is kept unchanged.
+            // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
+            //
+            // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
+            // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
+            //
+            // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
+            // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
+            // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
+            // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
+            // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
+            // The `duo < div_original` check terminates the algorithm with the correct quotient of
+            // 29u8 and remainder of 4u8
+            /*
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, false);
+            let mut quo = 0;
+            loop {
+                let div_shifted = div << shl;
+                let sub = duo.wrapping_sub(div_shifted);
+                // it is recommended to use `println!`s like this if functionality is unclear
+                /*
+                println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
+                    duo,
+                    div_shifted,
+                    sub,
+                    quo,
+                    shl
+                );
+                */
+                if 0 <= (sub as $iX) {
+                    duo = sub;
+                    quo += 1 << shl;
+                    if duo < div_original {
+                        // this branch is optional
+                        return (quo, duo)
+                    }
+                }
+                if shl == 0 {
+                    return (quo, duo)
+                }
+                shl -= 1;
+            }
+            */
+
+            // This restoring binary long division algorithm reduces the number of operations
+            // overall via:
+            // - `pow` can be shifted right instead of recalculating from `shl`
+            // - starting `div` shifted left and shifting it right for each step instead of
+            //   recalculating from `shl`
+            // - The `duo < div_original` branch is used to terminate the algorithm instead of the
+            //   `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
+            //   `div` from being shifted off the end. This check also only occurs on half of steps
+            //   on average, since it is behind the `(sub as $iX) >= 0` branch.
+            // - `shl` is now not needed by any aspect of the loop and thus only 3 variables are
+            //   being updated between steps
+            //
+            // There are many variations of this algorithm, but this encompasses the largest number
+            // of architectures and does not rely on carry flags, add-with-carry, or SWAR
+            // complications to be decently fast.
+            /*
+            let div_original = div;
+            let shl = $normalization_shift(duo, div, false);
+            let mut div: $uX = div << shl;
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as $iX) {
+                    duo = sub;
+                    quo |= pow;
+                    if duo < div_original {
+                        return (quo, duo)
+                    }
+                }
+                div >>= 1;
+                pow >>= 1;
+            }
+            */
+
+            // If the architecture has flags and predicated arithmetic instructions, it is possible
+            // to do binary long division without branching and in only 3 or 4 instructions. This is
+            // a variation of a 3 instruction central loop from
+            // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
+            //
+            // What allows doing division in only 3 instructions is realizing that instead of
+            // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
+            // place and `duo` can be shifted left. This means `div` does not have to be updated,
+            // but causes edge case problems and makes `duo < div_original` tests harder. Some
+            // architectures have an option to shift an argument in an arithmetic operation, which
+            // means `duo` can be shifted left and subtracted from in one instruction. The other two
+            // instructions are updating `quo` and undoing the subtraction if it turns out things
+            // were not normalized.
+
+            /*
+            // Perform one binary long division step on the already normalized arguments before
+            // the main loop. Note that this does a full normalization since the central loop needs
+            // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
+            // variation only did normalization to the nearest 4 steps, but this makes handling edge
+            // cases much harder. We do a full normalization and perform a binary long division
+            // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
+            // of `duo`, then the edge case handler shifts `div` right and does another long
+            // division step to always ensure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            let mut quo: $uX = 1;
+            duo = duo.wrapping_sub(div);
+            if duo < div_original {
+                return (1 << shl, duo);
+            }
+            let div_neg: $uX;
+            if (div as $iX) < 0 {
+                // A very ugly edge case where the most significant bit of `div` is set (after
+                // shifting to match `duo` when its most significant bit is at the sign bit), which
+                // leads to the sign bit of `div_neg` being cut off and carries not happening when
+                // they should. This branch performs a long division step that keeps `duo` in place
+                // and shifts `div` down.
+                div >>= 1;
+                div_neg = div.wrapping_neg();
+                let (sub, carry) = duo.overflowing_add(div_neg);
+                duo = sub;
+                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+                if !carry {
+                    duo = duo.wrapping_add(div);
+                }
+                shl -= 1;
+            } else {
+                div_neg = div.wrapping_neg();
+            }
+            // The add-with-carry that updates `quo` needs to have the carry set when a normalized
+            // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
+            // subtraction generates a carry when an unnormalized subtract happens, which is the
+            // opposite of what we want. Instead, we use
+            // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                // `ADDS duo, div, duo, LSL #1`
+                // (add `div` to `duo << 1` and set flags)
+                let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
+                duo = sub;
+                // `ADC quo, quo, quo`
+                // (add with carry). Effectively shifts `quo` left by 1 and sets the least
+                // significant bit to the carry.
+                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+                // `ADDCC duo, duo, div`
+                // (add if carry clear). Undoes the subtraction if no carry was generated.
+                if !carry {
+                    duo = duo.wrapping_add(div);
+                }
+            }
+            return (quo, duo >> shl);
+            */
+
+            // This is the SWAR (SIMD within a register) restoring division algorithm.
+            // This combines several ideas of the above algorithms:
+            //  - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
+            //    restoring division algorithm, some architectures can do the shifting and
+            //    subtraction step in one instruction.
+            //  - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
+            //    and adding one.
+            //  - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
+            //    LSB, so what if we use those bits to store `quo`?
+            // Through a complex setup, it is possible to manage `duo` and `quo` in the same
+            // register, and perform one step with 2 or 3 instructions. The only major downsides are
+            // that there is significant setup (it only saves instructions if `shl` is
+            // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
+            // initiated, and the number of division steps taken has to be exact (we cannot do more
+            // division steps than `shl`, because it introduces edge cases where quotient bits in
+            // `duo` start to collide with the real part of `div`).
+            /*
+            // first step. The quotient bit is stored in `quo` for now
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                // deal with same edge case as the 3 instruction restoring division algorithm, but
+                // the quotient bit from this step also has to be stored in `quo`
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                mask = tmp - 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    // restore
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+            } else {
+                mask = quo - 1;
+            }
+            // There is now room for quotient bits in `duo`.
+
+            // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
+            // `div` and end up with the subset of `shl` bits being all being set. This subset acts
+            // just like a two's complement negative one. The subset of `div` containing the divisor
+            // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
+            // as long as the quotient stays positive.
+            //
+            // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
+            // adds a quotient bit to the least significant bit.
+            // For example, 89 (0b01011001) divided by 3 (0b11):
+            //
+            // shl:4, div:0b00110000
+            // first step:
+            //       duo:0b01011001
+            // + div_neg:0b11010000
+            // ____________________
+            //           0b00101001
+            // quo is set to 0b00010000 and mask is set to 0b00001111 for later
+            //
+            // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
+            // quotient part of `duo` with `^` chars.
+            //     div:0b00110000
+            //               ^^^^
+            //   +     0b11111111
+            //   ________________
+            //         0b00101111
+            //               ^^^^
+            // div_neg:0b11010001
+            //
+            // first SWAR step:
+            //  duo_shl1:0b01010010
+            //                    ^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00100011
+            //                    ^
+            // second:
+            //  duo_shl1:0b01000110
+            //                   ^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00010111
+            //                   ^^
+            // third:
+            //  duo_shl1:0b00101110
+            //                  ^^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b11111111
+            //                  ^^^
+            // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
+            // part of `duo` is negative and the third step was an unnormalized step. The restore
+            // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
+            //
+            //   duo:0b11111111
+            //              ^^^
+            // + div:0b00101111
+            //             ^^^^
+            // ________________
+            //       0b00101110
+            //              ^^^
+            // `duo` is now back in the `duo_shl1` state it was at in the third step, with an
+            // unset quotient bit.
+            //
+            // final step (`shl` was 4, so exactly 4 steps must be taken)
+            //  duo_shl1:0b01011100
+            //                 ^^^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00101101
+            //                 ^^^^
+            // The quotient includes the `^` bits added with the `quo` bits from the beginning that
+            // contained the first step and potential edge case step,
+            // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
+            // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
+            // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
+            let div: $uX = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                duo = duo.wrapping_shl(1).wrapping_sub(div);
+                if (duo as $iX) < 0 {
+                    // restore
+                    duo = duo.wrapping_add(div);
+                }
+            }
+            // unpack the results of SWAR
+            return ((duo & mask) | quo, duo >> shl);
+            */
+
+            // The problem with the conditional restoring SWAR algorithm above is that, in practice,
+            // it requires assembly code to bring out its full unrolled potential (it seems that
+            // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
+            // that my algorithm intends). On architectures without predicated instructions, the
+            // code gen is especially bad. We need a default software division algorithm that is
+            // guaranteed to get decent code gen for the central loop.
+
+            // For non-SWAR algorithms, there is a way to do binary long division without
+            // predication or even branching. This involves creating a mask from the sign bit and
+            // performing different kinds of steps using that.
+            /*
+            let shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = div << shl;
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
+                duo -= div & sign_mask;
+                quo |= pow & sign_mask;
+                div >>= 1;
+                pow >>= 1;
+                if pow == 0 {
+                    break;
+                }
+            }
+            return (quo, duo);
+            */
+            // However, it requires about 4 extra operations (smearing the sign bit, negating the
+            // mask, and applying the mask twice) on top of the operations done by the actual
+            // algorithm. With SWAR however, just 2 extra operations are needed, making it
+            // practical and even the most optimal algorithm for some architectures.
+
+            // What we do is use custom assembly for predicated architectures that need software
+            // division, and for the default algorithm use a mask based restoring SWAR algorithm
+            // without conditionals or branches. On almost all architectures, this Rust code is
+            // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
+            // will unroll it in a decent way.
+
+            // standard opening for SWAR algorithm with first step and edge case handling
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                mask = tmp - 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+            } else {
+                mask = quo - 1;
+            }
+
+            // central loop
+            div = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break
+                }
+                i -= 1;
+                // shift left 1 and subtract
+                duo = duo.wrapping_shl(1).wrapping_sub(div);
+                // create mask
+                let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
+                // restore
+                duo = duo.wrapping_add(div & mask);
+            }
+            // unpack
+            return ((duo & mask) | quo, duo >> shl);
+
+            // miscellaneous binary long division algorithms that might be better for specific
+            // architectures
+
+            // Another kind of long division uses an interesting fact that `div` and `pow` can be
+            // negated when `duo` is negative to perform a "negated" division step that works in
+            // place of any normalization mechanism. This is a non-restoring division algorithm that
+            // is very similar to the non-restoring division algorithms that can be found on the
+            // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
+            // be viewed as shifting the least significant set bit right (e.g. if we enter a series
+            // of negated binary long division steps starting with `quo == 0b1011_0000` and
+            // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
+            // 0b1010_0010, 0b1010_0001).
+            /*
+            let div_original = div;
+            let shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = pow;
+            duo = duo.wrapping_sub(div);
+            if duo < div_original {
+                return (quo, duo);
+            }
+            div >>= 1;
+            pow >>= 1;
+            loop {
+                if (duo as $iX) < 0 {
+                    // Negated binary long division step.
+                    duo = duo.wrapping_add(div);
+                    quo = quo.wrapping_sub(pow);
+                } else {
+                    // Normal long division step.
+                    if duo < div_original {
+                        return (quo, duo)
+                    }
+                    duo = duo.wrapping_sub(div);
+                    quo = quo.wrapping_add(pow);
+                }
+                pow >>= 1;
+                div >>= 1;
+            }
+            */
+
+            // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
+            // SWAR techniques so that the only difference between steps is the negation of `div`.
+            // If there was an architecture with an instruction that negated inputs to an adder
+            // based on conditionals, and in place shifting (or a three input addition operation
+            // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
+            // single instruction central loop is possible. Microarchitectures often have inputs to
+            // their ALU that can invert the arguments and carry in of adders, but the architectures
+            // unfortunately do not have an instruction to dynamically invert this input based on
+            // conditionals.
+            /*
+            // SWAR opening
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    // restore
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+                mask = tmp - 1;
+            } else {
+                mask = quo - 1;
+            }
+
+            // central loop
+            let div: $uX = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                // note: the `wrapping_shl(1)` can be factored out, but would require another
+                // restoring division step to prevent `(duo as $iX)` from overflowing
+                if (duo as $iX) < 0 {
+                    // Negated binary long division step.
+                    duo = duo.wrapping_shl(1).wrapping_add(div);
+                } else {
+                    // Normal long division step.
+                    duo = duo.wrapping_shl(1).wrapping_sub(div);
+                }
+            }
+            if (duo as $iX) < 0 {
+                // Restore. This was not needed in the original nonrestoring algorithm because of
+                // the `duo < div_original` checks.
+                duo = duo.wrapping_add(div);
+            }
+            // unpack
+            return ((duo & mask) | quo, duo >> shl);
+            */
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) {
+            // There is a way of doing this without any branches, but it requires too many extra
+            // operations to be faster.
+            /*
+            let duo_s = duo >> ($n - 1);
+            let div_s = div >> ($n - 1);
+            let duo = (duo ^ duo_s).wrapping_sub(duo_s);
+            let div = (div ^ div_s).wrapping_sub(div_s);
+            let quo_s = duo_s ^ div_s;
+            let rem_s = duo_s;
+            let tmp = $unsigned_name(duo as $uX, div as $uX);
+            (
+                ((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s),
+                ((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s),
+            )
+            */
+
+            match (duo < 0, div < 0) {
+                (false, false) => {
+                    let t = $unsigned_name(duo as $uX, div as $uX);
+                    (t.0 as $iX, t.1 as $iX)
+                },
+                (true, false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX);
+                    ((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg())
+                },
+                (false, true) => {
+                    let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX);
+                    ((t.0 as $iX).wrapping_neg(), t.1 as $iX)
+                },
+                (true, true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX);
+                    (t.0 as $iX, (t.1 as $iX).wrapping_neg())
+                },
+            }
+        }
+    }
+}
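
For readers who want to try the restoring variant outside the macro machinery, here is a standalone sketch of the 3-variable restoring algorithm described in the comments above, written for `u32` (illustrative only; the macro versions add the SWAR central loop and edge-case handling this sketch omits):

    fn u32_div_rem_long(duo: u32, div: u32) -> (u32, u32) {
        assert!(div != 0, "division by zero");
        if duo < div {
            return (0, duo);
        }
        // shift `div` left so its most significant set bit lines up with `duo`'s
        let shl = div.leading_zeros() - duo.leading_zeros();
        let div_original = div;
        let mut duo = duo;
        let mut div = div << shl;
        let mut pow: u32 = 1 << shl;
        let mut quo: u32 = 0;
        loop {
            let sub = duo.wrapping_sub(div);
            if (sub as i32) >= 0 {
                duo = sub;
                quo |= pow;
                if duo < div_original {
                    // the quotient is complete and `duo` now holds the remainder
                    return (quo, duo);
                }
            }
            div >>= 1;
            pow >>= 1;
        }
    }

    fn main() {
        // the 178 / 6 example worked through in the comments above
        assert_eq!(u32_div_rem_long(178, 6), (29, 4));
    }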

+ 226 - 0
src/int/specialized_div_rem/delegate.rs

@@ -0,0 +1,226 @@
+/// Creates unsigned and signed division functions that use a combination of hardware division and
+/// binary long division to divide integers larger than what hardware division by itself can do.
+/// These functions are intended for microarchitectures that have division hardware, but not fast enough
+/// multiplication hardware for `impl_trifecta` to be faster.
+#[macro_export]
+macro_rules! impl_delegate {
+    (
+        $unsigned_name:ident, // name of the unsigned division function
+        $signed_name:ident, // name of the signed division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_normalization_shift:ident, // function for finding the normalization shift of $uX
+        $half_division:ident, // function for division of a $uX by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD.
+        $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) {
+            // The two possibility algorithm, undersubtracting long division algorithm, or any kind
+            // of reciprocal based algorithm will not be fastest, because they involve large
+            // multiplications that we assume to not be fast enough relative to the divisions to
+            // outweigh setup times.
+
+            // the number of bits in a $uX
+            let n = $n_h * 2;
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+
+            match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+                (true, true, _) => {
+                    $zero_div_fn()
+                }
+                (_, false, true) => {
+                    // `duo` < `div`
+                    return (0, duo)
+                }
+                (false, true, true) => {
+                    // delegate to smaller division
+                    let tmp = $half_division(duo_lo, div_lo);
+                    return (tmp.0 as $uD, tmp.1 as $uD)
+                }
+                (false, true, false) => {
+                    if duo_hi < div_lo {
+                        // `quo_hi` will always be 0. This performs a binary long division algorithm
+                        // to zero `duo_hi` followed by a half division.
+
+                        // We can calculate the normalization shift using only `$uX` size functions.
+                        // If we calculated the normalization shift using
+                        // `$half_normalization_shift(duo_hi, div_lo, false)`, it would break the
+                        // assumption the function has that the first argument is more than the
+                        // second argument. If the arguments are switched, the assumption holds true
+                        // since `duo_hi < div_lo`.
+                        let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
+                        let shl = if norm_shift == 0 {
+                            // Consider what happens if the msbs of `duo_hi` and `div_lo` align with
+                            // no shifting. The normalization shift will always return
+                            // `norm_shift == 0` regardless of whether it is fully normalized,
+                            // because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would
+                            // result in shift overflow down the line. For the edge case, because
+                            // both `duo_hi < div_lo` and we are comparing all the significant bits
+                            // of `duo_hi` and `div`, we can make `shl = n - 1`.
+                            n - 1
+                        } else {
+                            // We also cannot just use `shl = n - norm_shift - 1` in the general
+                            // case, because when we are not in the edge case comparing all the
+                            // significant bits, then the full `duo < div` may not be true and thus
+                            // breaks the division algorithm.
+                            n - norm_shift
+                        };
+
+                        // The 3 variable restoring division algorithm (see binary_long.rs) is ideal
+                        // for this task, since `pow` and `quo` can be `$uX` and the delegation
+                        // check is simple.
+                        let mut div: $uD = div << shl;
+                        let mut pow_lo: $uX = 1 << shl;
+                        let mut quo_lo: $uX = 0;
+                        let mut duo = duo;
+                        loop {
+                            let sub = duo.wrapping_sub(div);
+                            if 0 <= (sub as $iD) {
+                                duo = sub;
+                                quo_lo |= pow_lo;
+                                let duo_hi = (duo >> n) as $uX;
+                                if duo_hi == 0 {
+                                    // Delegate to get the rest of the quotient. Note that the
+                                    // `div_lo` here is the original unshifted `div`.
+                                    let tmp = $half_division(duo as $uX, div_lo);
+                                    return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD)
+                                }
+                            }
+                            div >>= 1;
+                            pow_lo >>= 1;
+                        }
+                    } else if duo_hi == div_lo {
+                        // `quo_hi == 1`. This branch is cheap and helps with edge cases.
+                        let tmp = $half_division(duo as $uX, div as $uX);
+                        return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD)
+                    } else {
+                        // `div_lo < duo_hi`
+                        // `rem_hi == 0`
+                        if (div_lo >> $n_h) == 0 {
+                            // Short division of $uD by a $uH, using $uX by $uX division
+                            let div_0 = div_lo as $uH as $uX;
+                            let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                            let duo_mid =
+                                ((duo >> $n_h) as $uH as $uX)
+                                | (rem_3 << $n_h);
+                            let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                            let duo_lo =
+                                (duo as $uH as $uX)
+                                | (rem_2 << $n_h);
+                            let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                            return (
+                                (quo_0 as $uD)
+                                | ((quo_1 as $uD) << $n_h)
+                                | ((quo_hi as $uD) << n),
+                                rem_1 as $uD
+                            )
+                        }
+
+                        // This is basically a short division composed of a half division for the hi
+                        // part, specialized 3 variable binary long division in the middle, and
+                        // another half division for the lo part.
+                        let duo_lo = duo as $uX;
+                        let tmp = $half_division(duo_hi, div_lo);
+                        let quo_hi = tmp.0;
+                        let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
+                        // This check is required to avoid breaking the long division below.
+                        if duo < div {
+                            return ((quo_hi as $uD) << n, duo);
+                        }
+
+                        // The half division handled all shift alignments down to `n`, so this
+                        // division can continue with a shift of `n - 1`.
+                        let mut div: $uD = div << (n - 1);
+                        let mut pow_lo: $uX = 1 << (n - 1);
+                        let mut quo_lo: $uX = 0;
+                        loop {
+                            let sub = duo.wrapping_sub(div);
+                            if 0 <= (sub as $iD) {
+                                duo = sub;
+                                quo_lo |= pow_lo;
+                                let duo_hi = (duo >> n) as $uX;
+                                if duo_hi == 0 {
+                                    // Delegate to get the rest of the quotient. Note that the
+                                    // `div_lo` here is the original unshifted `div`.
+                                    let tmp = $half_division(duo as $uX, div_lo);
+                                    return (
+                                        (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n),
+                                        tmp.1 as $uD
+                                    );
+                                }
+                            }
+                            div >>= 1;
+                            pow_lo >>= 1;
+                        }
+                    }
+                }
+                (_, false, false) => {
+                    // Full $uD by $uD binary long division. `quo_hi` will always be 0.
+                    if duo < div {
+                        return (0, duo);
+                    }
+                    let div_original = div;
+                    let shl = $half_normalization_shift(duo_hi, div_hi, false);
+                    let mut duo = duo;
+                    let mut div: $uD = div << shl;
+                    let mut pow_lo: $uX = 1 << shl;
+                    let mut quo_lo: $uX = 0;
+                    loop {
+                        let sub = duo.wrapping_sub(div);
+                        if 0 <= (sub as $iD) {
+                            duo = sub;
+                            quo_lo |= pow_lo;
+                            if duo < div_original {
+                                return (quo_lo as $uD, duo)
+                            }
+                        }
+                        div >>= 1;
+                        pow_lo >>= 1;
+                    }
+                }
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
+            match (duo < 0, div < 0) {
+                (false, false) => {
+                    let t = $unsigned_name(duo as $uD, div as $uD);
+                    (t.0 as $iD, t.1 as $iD)
+                },
+                (true, false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
+                    ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
+                },
+                (false, true) => {
+                    let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(), t.1 as $iD)
+                },
+                (true, true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
+                    (t.0 as $iD, (t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}
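
The short-division branch above (dividing a `$uD` by a `$uH` with three half divisions) is the classic schoolbook scheme of dividing chunk by chunk while carrying the remainder forward. A small standalone sketch of the same idea, dividing a `u64` by a `u16` using only `u32`-by-`u32` divisions (illustrative only, with hypothetical names; not the macro code itself):

    fn u64_div_rem_by_u16(duo: u64, div: u16) -> (u64, u64) {
        assert!(div != 0, "division by zero");
        let d = div as u32;
        let mut quo: u64 = 0;
        let mut rem: u32 = 0;
        // walk the four 16-bit chunks of `duo` from most to least significant
        let mut i = 4;
        loop {
            if i == 0 {
                break;
            }
            i -= 1;
            let chunk = ((duo >> (16 * i)) & 0xFFFF) as u32;
            // `rem < div <= u16::MAX`, so this partial dividend always fits in a
            // u32 and its quotient always fits in 16 bits
            let part = (rem << 16) | chunk;
            quo = (quo << 16) | (part / d) as u64;
            rem = part % d;
        }
        (quo, rem as u64)
    }

    fn main() {
        let (q, r) = u64_div_rem_by_u16(0x1234_5678_9ABC_DEF0, 1000);
        assert_eq!(q, 0x1234_5678_9ABC_DEF0 / 1000);
        assert_eq!(r, 0x1234_5678_9ABC_DEF0 % 1000);
    }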

+ 314 - 0
src/int/specialized_div_rem/mod.rs

@@ -0,0 +1,314 @@
+// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
+#![allow(unused_unsafe)]
+
+//! This `specialized_div_rem` module is originally from version 1.0.0 of the
+//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
+//! module, since unoptimized compilation may generate references to `memcpy`.
+//!
+//! The purpose of these macros is to easily change both the division algorithm used
+//! for a given integer size and the half division used by that algorithm. The way
+//! functions call each other is also constructed such that linkers will find the chain of
+//! software and hardware divisions needed for every size of signed and unsigned division.
+//! For example, most target compilations do the following:
+//!
+//!  - Many 128 bit division functions like `u128::wrapping_div` use
+//!    `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
+//!    is not a 128 bit by 128 bit hardware division function in most architectures.
+//!    `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
+//!    `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
+//!    one function to calculate both the quotient and remainder). If configuration flags
+//!    enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
+//!    which requires the half sized division `u64_by_u64_div_rem`. If the architecture
+//!    supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
+//!    reduced to those instructions. Note that we do not specify the half size division
+//!    directly to be `__udivdi3`, because hardware division would never be introduced.
+//!  - If the architecture does not supply a 64 bit hardware division instruction, u64
+//!    divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
+//!    which is defined by `impl_delegate!`. The half division for this algorithm is
+//!    `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
+//!    software division algorithms.
+//!  - If the architecture does not supply a 32 bit hardware division instruction, linkers will
+//!    look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
+//!    division, so the chain of calls ends here.
+//!
+//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
+//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
+//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
+
+// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
+// ridiculously complex
+
+#[macro_use]
+mod norm_shift;
+
+#[macro_use]
+mod binary_long;
+
+#[macro_use]
+mod delegate;
+
+#[macro_use]
+mod trifecta;
+
+#[macro_use]
+mod asymmetric;
+
+/// The behavior of all divisions by zero is controlled by this function. It should be impossible
+/// for Rust users to reach this function, unless `compiler-builtins` public division functions or
+/// `core/std::unchecked_div/rem` are used directly without a preceding zero check.
+fn zero_div_fn() -> ! {
+    unsafe { core::hint::unreachable_unchecked() }
+}
+
+// The `B` extension on RISC-V determines if a CLZ assembly instruction exists
+#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+const USE_LZ: bool = cfg!(target_feature = "b");
+
+#[cfg(target_arch = "arm")]
+const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") {
+    // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This
+    // is needed to successfully differentiate between targets like `thumbv8m.base` and
+    // `thumbv8m.main`.
+    cfg!(target_feature = "v6t2")
+} else {
+    // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported.
+    // Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to
+    // work.
+    cfg!(target_feature = "v5te")
+};
+
+// All other targets Rust supports have CLZ instructions
+#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))]
+const USE_LZ: bool = true;
+
+impl_normalization_shift!(
+    u32_normalization_shift,
+    USE_LZ,
+    32,
+    u32,
+    i32,
+    allow(dead_code)
+);
+impl_normalization_shift!(
+    u64_normalization_shift,
+    USE_LZ,
+    64,
+    u64,
+    i64,
+    allow(dead_code)
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
+    if let Some(quo) = duo.checked_div(div) {
+        if let Some(rem) = duo.checked_rem(div) {
+            return (quo, rem);
+        }
+    }
+    zero_div_fn()
+}
+
+// `inline(never)` is placed on unsigned division functions so that there are just three division
+// functions (`u32_div_rem`, `u64_div_rem`, and `u128_div_rem`) backing all `compiler-builtins`
+// division functions. The signed functions like `i32_div_rem` will get inlined into the
+// `compiler-builtins` signed division functions, so that they directly call the three division
+// functions. Otherwise, LLVM may try to inline the unsigned division functions 4 times into the
+// signed division functions, which results in an explosion in code size.
+
+// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
+// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
+// faster if the target pointer width is at least 64.
+#[cfg(all(
+    not(all(feature = "asm", target_arch = "x86_64")),
+    not(any(target_pointer_width = "16", target_pointer_width = "32"))
+))]
+impl_trifecta!(
+    u128_div_rem,
+    i128_div_rem,
+    zero_div_fn,
+    u64_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128,
+    i128,
+    inline(never);
+    inline
+);
+
+// If the pointer width is less than 64, then the target architecture almost certainly does not have
+// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
+#[cfg(all(
+    not(all(feature = "asm", target_arch = "x86_64")),
+    any(target_pointer_width = "16", target_pointer_width = "32")
+))]
+impl_delegate!(
+    u128_div_rem,
+    i128_div_rem,
+    zero_div_fn,
+    u64_normalization_shift,
+    u64_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128,
+    i128,
+    inline(never);
+    inline
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u64`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
+#[cfg(all(feature = "asm", target_arch = "x86_64"))]
+#[inline]
+unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let quo: u64;
+    let rem: u64;
+    unsafe {
+        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
+        // by `div`. The quotient is stored in rax and the remainder in rdx.
+        asm!(
+            "div {0}",
+            in(reg) div,
+            inlateout("rax") duo_lo => quo,
+            inlateout("rdx") duo_hi => rem,
+            options(pure, nomem, nostack)
+        );
+    }
+    (quo, rem)
+}
+
+// use `asymmetric` instead of `trifecta` on x86_64
+#[cfg(all(feature = "asm", target_arch = "x86_64"))]
+impl_asymmetric!(
+    u128_div_rem,
+    i128_div_rem,
+    zero_div_fn,
+    u64_by_u64_div_rem,
+    u128_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128,
+    i128,
+    inline(never);
+    inline
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+#[allow(dead_code)]
+fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
+    if let Some(quo) = duo.checked_div(div) {
+        if let Some(rem) = duo.checked_rem(div) {
+            return (quo, rem);
+        }
+    }
+    zero_div_fn()
+}
+
+// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
+// than the register size.
+#[cfg(all(
+    not(all(feature = "asm", target_arch = "x86")),
+    not(target_pointer_width = "64")
+))]
+impl_delegate!(
+    u64_div_rem,
+    i64_div_rem,
+    zero_div_fn,
+    u32_normalization_shift,
+    u32_by_u32_div_rem,
+    16,
+    u16,
+    u32,
+    u64,
+    i64,
+    inline(never);
+    inline
+);
+
+// When not on x86 and the pointer width is 64, use `binary_long`.
+#[cfg(all(
+    not(all(feature = "asm", target_arch = "x86")),
+    target_pointer_width = "64"
+))]
+impl_binary_long!(
+    u64_div_rem,
+    i64_div_rem,
+    zero_div_fn,
+    u64_normalization_shift,
+    64,
+    u64,
+    i64,
+    inline(never);
+    inline
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u32`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
+#[cfg(all(feature = "asm", target_arch = "x86"))]
+#[inline]
+unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
+    let duo_lo = duo as u32;
+    let duo_hi = (duo >> 32) as u32;
+    let quo: u32;
+    let rem: u32;
+    unsafe {
+        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
+        // by `div`. The quotient is stored in eax and the remainder in edx.
+        asm!(
+            "div {0}",
+            in(reg) div,
+            inlateout("rax") duo_lo => quo,
+            inlateout("rdx") duo_hi => rem,
+            options(pure, nomem, nostack)
+        );
+    }
+    (quo, rem)
+}
+
+// use `asymmetric` instead of `delegate` on x86
+#[cfg(all(feature = "asm", target_arch = "x86"))]
+impl_asymmetric!(
+    u64_div_rem,
+    i64_div_rem,
+    zero_div_fn,
+    u32_by_u32_div_rem,
+    u64_by_u32_div_rem,
+    16,
+    u16,
+    u32,
+    u64,
+    i64,
+    inline(never);
+    inline
+);
+
+// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
+impl_binary_long!(
+    u32_div_rem,
+    i32_div_rem,
+    zero_div_fn,
+    u32_normalization_shift,
+    32,
+    u32,
+    i32,
+    inline(never);
+    inline
+);

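The `u128_by_u64_div_rem` helper above hands the hardware `div` instruction a 128 bit dividend
split across rdx:rax, and its safety comment is the whole contract. Below is a hedged pure-Rust
model of what that instruction computes; the function name, the `Option` return, and the explicit
precondition checks are illustrative only and not part of this patch:

// models x86_64 `div`: divide the 128 bit value rdx:rax by a 64 bit divisor
fn u128_by_u64_div_rem_model(duo: u128, div: u64) -> Option<(u64, u64)> {
    if div == 0 {
        return None; // hardware raises a #DE (divide error) fault here
    }
    let quo = duo / (div as u128);
    if quo > u64::MAX as u128 {
        return None; // hardware also faults when the quotient overflows rax
    }
    Some((quo as u64, (duo % (div as u128)) as u64))
}

fn main() {
    // A typical caller-side precondition that guarantees the quotient fits in 64 bits: the
    // upper 64 bits of `duo` are strictly less than `div`.
    let duo: u128 = (5u128 << 64) | 7;
    let div: u64 = 9;
    assert!((duo >> 64) < div as u128);
    assert_eq!(
        u128_by_u64_div_rem_model(duo, div),
        Some(((duo / 9) as u64, (duo % 9) as u64))
    );
}
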
+ 106 - 0
src/int/specialized_div_rem/norm_shift.rs

@@ -0,0 +1,106 @@
+/// Creates a function used by some division algorithms to compute the "normalization shift".
+#[macro_export]
+macro_rules! impl_normalization_shift {
+    (
+        $name:ident, // name of the normalization shift function
+        // boolean for if `$uX::leading_zeros` should be used (if an architecture does not have a
+        // hardware instruction for `usize::leading_zeros`, then this should be `false`)
+        $use_lz:ident,
+        $n:tt, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer type for the inputs of `$name`
+        $iX:ident, // signed integer type for the inputs of `$name`
+        $($unsigned_attr:meta),* // attributes for the function
+    ) => {
+        /// Finds the shift left that the divisor `div` needs in order to be normalized for a binary
+        /// long division step with the dividend `duo`. NOTE: This function assumes that these edge
+        /// cases have been handled before reaching it:
+        /// `
+        /// if div == 0 {
+        ///     panic!("attempt to divide by zero")
+        /// }
+        /// if duo < div {
+        ///     return (0, duo)
+        /// }
+        /// `
+        ///
+        /// Normalization is defined as (where `shl` is the output of this function):
+        /// `
+        /// if duo.leading_zeros() != (div << shl).leading_zeros() {
+        ///     // If the most significant bits of `duo` and `div << shl` are not in the same place,
+        ///     // then `div << shl` has one more leading zero than `duo`.
+        ///     assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+        ///     // Also, `duo` is less than `2*(div << shl)` (otherwise the first division step
+        ///     // would not be able to clear the msb of `duo`)
+        ///     assert!(duo < (div << (shl + 1)));
+        /// }
+        /// if full_normalization {
+        ///     // Some algorithms do not need "full" normalization, which means that `duo` is
+        ///     // larger than `div << shl` when the most significant bits are aligned.
+        ///     assert!((div << shl) <= duo);
+        /// }
+        /// `
+        ///
+        /// Note: If the software bisection algorithm is being used in this function, it happens
+        /// that full normalization always occurs, so be careful that new algorithms are not
+        /// invisibly depending on this invariant when `full_normalization` is set to `false`.
+        $(
+            #[$unsigned_attr]
+        )*
+        fn $name(duo: $uX, div: $uX, full_normalization: bool) -> usize {
+            // We have to find the leading zeros of `div` to know where its msb (most significant
+            // set bit) is to even begin binary long division. It is also good to know where the msb
+            // of `duo` is so that useful work can be started instead of shifting `div` for all
+            // possible quotients (many division steps are wasted if `duo.leading_zeros()` is large
+            // and `div` starts out being shifted all the way to the msb). Aligning the msbs of
+            // `div` and `duo` could be done by shifting `div` left by
+            // `div.leading_zeros() - duo.leading_zeros()`, but some CPUs without division hardware
+            // also do not have single instructions for calculating `leading_zeros`. Instead of
+            // software doing two bisections to find the two `leading_zeros`, we do one bisection to
+            // find `div.leading_zeros() - duo.leading_zeros()` without actually knowing either of
+            // the leading zeros values.
+
+            let mut shl: usize;
+            if $use_lz {
+                shl = (div.leading_zeros() - duo.leading_zeros()) as usize;
+                if full_normalization {
+                    if duo < (div << shl) {
+                        // when the msbs of `duo` and `div << shl` are aligned, `div << shl` may be
+                        // larger than `duo`, so we decrease the shift by 1.
+                        shl -= 1;
+                    }
+                }
+            } else {
+                let mut test = duo;
+                shl = 0usize;
+                let mut lvl = $n >> 1;
+                loop {
+                    let tmp = test >> lvl;
+                    // It happens that a final `duo < (div << shl)` check is not needed, because the
+                    // `div <= tmp` check ensures that the msb of `test` never passes the msb of
+                    // `div`, and any set bits shifted off the end of `test` would still keep
+                    // `div <= tmp` true.
+                    if div <= tmp {
+                        test = tmp;
+                        shl += lvl;
+                    }
+                    // narrow down bisection
+                    lvl >>= 1;
+                    if lvl == 0 {
+                        break
+                    }
+                }
+            }
+            // tests the invariants that should hold before beginning binary long division
+            /*
+            if full_normalization {
+                assert!((div << shl) <= duo);
+            }
+            if duo.leading_zeros() != (div << shl).leading_zeros() {
+                assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+                assert!(duo < (div << (shl + 1)));
+            }
+            */
+            shl
+        }
+    }
+}

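As a worked illustration of the bisection branch described above, here is a standalone u32
version of the normalization shift search together with the invariants from the macro's doc
comment. This sketch is illustrative and not generated by the macro:

// software-bisection normalization shift for u32; the caller has already handled `div == 0`
// and `duo < div`, as the macro's doc comment requires
fn normalization_shift_sketch(duo: u32, div: u32) -> usize {
    assert!(div != 0 && duo >= div);
    let mut test = duo;
    let mut shl = 0usize;
    let mut lvl = 32usize >> 1;
    loop {
        let tmp = test >> lvl;
        // the `div <= tmp` check keeps the msb of `test` from ever passing the msb of `div`
        if div <= tmp {
            test = tmp;
            shl += lvl;
        }
        lvl >>= 1;
        if lvl == 0 {
            break;
        }
    }
    shl
}

fn main() {
    let (duo, div) = (0b1011_0000u32, 0b101u32);
    let shl = normalization_shift_sketch(duo, div);
    assert_eq!(shl, 5);
    // the bisection branch happens to always fully normalize
    assert!((div << shl) <= duo);
    assert!(duo < (div << (shl + 1)));
}
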
+ 441 - 0
src/int/specialized_div_rem/trifecta.rs

@@ -0,0 +1,441 @@
+/// Creates unsigned and signed division functions optimized for division of integers with bitwidths
+/// larger than the largest hardware integer division supported. These functions use large radix
+/// division algorithms that require both fast division and very fast widening multiplication on the
+/// target microarchitecture. Otherwise, `impl_delegate` should be used instead.
+#[macro_export]
+macro_rules! impl_trifecta {
+    (
+        $unsigned_name:ident, // name of the unsigned division function
+        $signed_name:ident, // name of the signed division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_division:ident, // function for division of a $uX by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) {
+            // This is called the trifecta algorithm because it uses three main algorithms: short
+            // division for small divisors, the two possibility algorithm for large divisors, and an
+            // undersubtracting long division algorithm for intermediate cases.
+
+            // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this
+            // to use a widening multiply to 128 bits on the relevant architectures.
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            // the number of bits in a $uX
+            let n = $n_h * 2;
+
+            if div == 0 {
+                $zero_div_fn()
+            }
+
+            // Trying to use a normalization shift function will cause inelegancies in the code and
+            // inefficiencies for architectures with a native count leading zeros instruction. The
+            // undersubtracting algorithm needs both values (keeping the original `div_lz` but
+            // updating `duo_lz` multiple times), so we assume hardware support for fast
+            // `leading_zeros` calculation.
+            let div_lz = div.leading_zeros();
+            let mut duo_lz = duo.leading_zeros();
+
+            // the possible ranges of `duo` and `div` at this point:
+            // `0 <= duo < 2^n_d`
+            // `1 <= div < 2^n_d`
+
+            // quotient is 0 or 1 branch
+            if div_lz <= duo_lz {
+                // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at
+                // least one place higher than `div` for the quotient to be more than 1.
+                if duo >= div {
+                    return (1, duo - div)
+                } else {
+                    return (0, duo)
+                }
+            }
+
+            // `_sb` is the number of significant bits (from the ones place to the highest set bit)
+            // `{2, 2^div_sb} <= duo < 2^n_d`
+            // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // smaller division branch
+            if duo_lz >= n {
+                // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the
+                // `div_lz <= duo_lz` branch) so no numerical error.
+                let (quo, rem) = $half_division(duo as $uX, div as $uX);
+                return (
+                    quo as $uD,
+                    rem as $uD
+                )
+            }
+
+            // `{2^n, 2^div_sb} <= duo < 2^n_d`
+            // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // short division branch
+            if div_lz >= (n + $n_h) {
+                // `1 <= div < {2^duo_sb, 2^n_h}`
+
+                // It is barely possible to improve the performance of this by calculating the
+                // reciprocal and removing one `$half_division`, but only if the CPU can do fast
+                // multiplications in parallel. Other reciprocal based methods can remove two
+                // `$half_division`s, but have multiplications that cannot be done in parallel and
+                // reduce performance. I have decided to use this trivial short division method and
+                // rely on the CPU having quick divisions.
+
+                let duo_hi = (duo >> n) as $uX;
+                let div_0 = div as $uH as $uX;
+                let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                let duo_mid =
+                    ((duo >> $n_h) as $uH as $uX)
+                    | (rem_3 << $n_h);
+                let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                let duo_lo =
+                    (duo as $uH as $uX)
+                    | (rem_2 << $n_h);
+                let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                return (
+                    (quo_0 as $uD)
+                    | ((quo_1 as $uD) << $n_h)
+                    | ((quo_hi as $uD) << n),
+                    rem_1 as $uD
+                )
+            }
+
+            // relative leading significant bits, cannot overflow because of above branches
+            let lz_diff = div_lz - duo_lz;
+
+            // `{2^n, 2^div_sb} <= duo < 2^n_d`
+            // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // `mul` or `mul - 1` branch
+            if lz_diff < $n_h {
+                // Two possibility division algorithm
+
+                // The most significant bits of `duo` and `div` are within `$n_h` bits of each
+                // other. If we take the `n` most significant bits of `duo` and divide them by the
+                // corresponding bits in `div`, it produces a quotient value `quo`. It happens that
+                // `quo` or `quo - 1` will always be the correct quotient for the whole number. In
+                // other words, the bits less significant than the `n` most significant bits of
+                // `duo` and `div` can only influence the quotient to be one of two values.
+                // Because there are only two possibilities, there only needs to be one `$uH` sized
+                // division, a `$uH` by `$uD` multiplication, and only one branch with a few simple
+                // operations.
+                //
+                // Proof that the true quotient can only be `quo` or `quo - 1`.
+                // All `/` operators here are floored divisions.
+                //
+                // `shift` is the number of bits not in the higher `n` significant bits of `duo`.
+                // (definitions)
+                // 0. shift = n - duo_lz
+                // 1. duo_sig_n == duo / 2^shift
+                // 2. div_sig_n == div / 2^shift
+                // 3. quo == duo_sig_n / div_sig_n
+                //
+                //
+                // We are trying to find the true quotient, `true_quo`.
+                // 4. true_quo = duo / div. (definition)
+                //
+                // This is true because of the bits that are cut off during the bit shift.
+                // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift.
+                // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift.
+                //
+                // Dividing each bound of (5) by each bound of (6) gives 4 possibilities for what
+                // `true_quo == duo / div` is bounded by:
+                // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift)
+                // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift)
+                // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift)
+                // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift)
+                //
+                // Simplifying each of these four:
+                // duo_sig_n / div_sig_n
+                // duo_sig_n / (div_sig_n + 1)
+                // (duo_sig_n + 1) / div_sig_n
+                // (duo_sig_n + 1) / (div_sig_n + 1)
+                //
+                // Taking the smallest and the largest of these as the low and high bounds
+                // and replacing `duo / div` with `true_quo`:
+                // 7. duo_sig_n / (div_sig_n + 1) <= true_quo < (duo_sig_n + 1) / div_sig_n
+                //
+                // The `lz_diff < n_h` conditional on this branch makes sure that `div_sig_n` is at
+                // least `2^n_h`, and the `div_lz <= duo_lz` branch makes sure that the highest bit
+                // of `div_sig_n` is not the `2^(n - 1)` bit.
+                // 8. `2^(n - 1) <= duo_sig_n < 2^n`
+                // 9. `2^n_h <= div_sig_n < 2^(n - 1)`
+                //
+                // We want to prove that either
+                // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1)` or that
+                // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1) + 1`.
+                //
+                // We also want to prove that `quo` is one of these:
+                // `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+                // `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n`.
+                //
+                // When 1 is added to the numerator of `duo_sig_n / div_sig_n` to produce
+                // `(duo_sig_n + 1) / div_sig_n`, it is not possible that the value increases by
+                // more than 1 with floored integer arithmetic and `div_sig_n != 0`. Consider
+                // `x/y + 1 < (x + 1)/y` <=> `x/y + 1 < x/y + 1/y` <=> `1 < 1/y` <=> `y < 1`.
+                // `div_sig_n` is a nonzero integer. Thus,
+                // 10. `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n` or
+                //     `(duo_sig_n / div_sig_n) + 1 == (duo_sig_n + 1) / div_sig_n`.
+                //
+                // When 1 is added to the denominator of `duo_sig_n / div_sig_n` to produce
+                // `duo_sig_n / (div_sig_n + 1)`, it is not possible that the value decreases by
+                // more than 1 with the bounds (8) and (9). Consider `x/y - 1 <= x/(y + 1)` <=>
+                // `(x - y)/y < x/(y + 1)` <=> `(y + 1)*(x - y) < x*y` <=> `x*y - y*y + x - y < x*y`
+                // <=> `x < y*y + y`. The smallest value of `div_sig_n` is `2^n_h` and the largest
+                // value of `duo_sig_n` is `2^n - 1`. Substituting reveals `2^n - 1 < 2^n + 2^n_h`.
+                // Thus,
+                // 11. `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+                //     `(duo_sig_n / div_sig_n) - 1 == duo_sig_n / (div_sig_n + 1)`
+                //
+                // Combining both (10) and (11), we know that
+                // `quo - 1 <= duo_sig_n / (div_sig_n + 1) <= true_quo
+                // < (duo_sig_n + 1) / div_sig_n <= quo + 1` and therefore:
+                // 12. quo - 1 <= true_quo < quo + 1
+                //
+                // In a lot of division algorithms using smaller divisions to construct a larger
+                // division, we often encounter a situation where the approximate `quo` value
+                // calculated from a smaller division is multiple increments away from the true
+                // `quo` value. In those algorithms, multiple correction steps have to be applied.
+                // Those correction steps may need more multiplications to test `duo - (quo*div)`
+                // again. Because of the fact that our `quo` can only be one of two values, we can
+                // see if `duo - (quo*div)` overflows. If it did overflow, then we know that we have
+                // the larger of the two values (since the true quotient is unique, and any larger
+                // quotient will cause `duo - (quo*div)` to be negative). Also because there is only
+                // one correction needed, we can calculate the remainder `duo - (true_quo*div) ==
+                // duo - ((quo - 1)*div) == duo - (quo*div - div) == duo + div - quo*div`.
+                // If `duo - (quo*div)` did not overflow, then we have the correct answer.
+                let shift = n - duo_lz;
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let quo = $half_division(duo_sig_n, div_sig_n).0;
+
+                // The multiplication with the larger candidate `quo` value can overflow `$uD` in
+                // the right circumstances. This is a manual `carrying_mul_add` with overflow checking.
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(quo, div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if (overflow != 0) || (duo < tmp) {
+                    return (
+                        (quo - 1) as $uD,
+                        // Both the addition and subtraction can overflow, but when combined end up
+                        // as a correct positive number.
+                        duo.wrapping_add(div).wrapping_sub(tmp)
+                    )
+                } else {
+                    return (
+                        quo as $uD,
+                        duo - tmp
+                    )
+                }
+            }
+
+            // Undersubtracting long division algorithm.
+            // Instead of clearing a minimum of 1 bit from `duo` per iteration via binary long
+            // division, `n_h - 1` bits are cleared per iteration with this algorithm. It is a more
+            // complicated version of regular long division. Most integer division algorithms tend
+            // to guess a part of the quotient, and may have a larger quotient than the true
+            // quotient (which when multiplied by `div` will "oversubtract" the original dividend).
+            // They then check if the quotient was in fact too large and then have to correct it.
+            // This long division algorithm has been carefully constructed to always underguess the
+            // quotient by slim margins. This allows different subalgorithms to be blindly jumped to
+            // without needing an extra correction step.
+            //
+            // The only problem is that this subalgorithm will not work for many ranges of `duo` and
+            // `div`. Fortunately, the short division, two possibility algorithm, and other simple
+            // cases happen to exactly fill these gaps.
+            //
+            // For an example, consider the division of 76543210 by 213 and assume that `n_h` is
+            // equal to two decimal digits (note: we are working with base 10 here for readability).
+            // The first `sig_n_h` part of the divisor (21) is taken and is incremented by 1 to
+            // prevent oversubtraction. We also record the number of extra places not a part of
+            // the `sig_n` or `sig_n_h` parts.
+            //
+            // sig_n_h == 2 digits, sig_n == 4 digits
+            //
+            // vvvv     <- `duo_sig_n`
+            // 76543210
+            //     ^^^^ <- extra places in duo, `duo_extra == 4`
+            //
+            // vv  <- `div_sig_n_h`
+            // 213
+            //   ^ <- extra places in div, `div_extra == 1`
+            //
+            // The difference in extra places, `duo_extra - div_extra == extra_shl == 3`, is used
+            // for shifting partial sums in the long division.
+            //
+            // In the first step, the first `sig_n` part of duo (7654) is divided by
+            // `div_sig_n_h_add_1` (22), which results in a partial quotient of 347. This is
+            // multiplied by the whole divisor to make 73911, which is shifted left by `extra_shl`
+            // and subtracted from duo. The partial quotient is also shifted left by `extra_shl` to
+            // be added to `quo`.
+            //
+            //    347
+            //  ________
+            // |76543210
+            // -73911
+            //   2632210
+            //
+            // Variables dependent on duo have to be updated:
+            //
+            // vvvv    <- `duo_sig_n == 2632`
+            // 2632210
+            //     ^^^ <- `duo_extra == 3`
+            //
+            // `extra_shl == 2`
+            //
+            // Two more steps are taken after this and then duo fits into `n` bits, and then a final
+            // normal long division step is made. The partial quotients are all progressively added
+            // to each other in the actual algorithm, but here I have left them all in a tower that
+            // can be added together to produce the quotient, 359357.
+            //
+            //        14
+            //       443
+            //     119
+            //    347
+            //  ________
+            // |76543210
+            // -73911
+            //   2632210
+            //  -25347
+            //     97510
+            //    -94359
+            //      3151
+            //     -2982
+            //       169 <- the remainder
+
+            let mut duo = duo;
+            let mut quo: $uD = 0;
+
+            // The number of lesser significant bits not a part of `div_sig_n_h`
+            let div_extra = (n + $n_h) - div_lz;
+
+            // The most significant `n_h` bits of div
+            let div_sig_n_h = (div >> div_extra) as $uH;
+
+            // This needs to be a `$uX` in case of overflow from the increment
+            let div_sig_n_h_add1 = (div_sig_n_h as $uX) + 1;
+
+            // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d`
+            // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}`
+            loop {
+                // The number of lesser significant bits not a part of `duo_sig_n`
+                let duo_extra = n - duo_lz;
+
+                // The most significant `n` bits of `duo`
+                let duo_sig_n = (duo >> duo_extra) as $uX;
+
+                // the two possibility algorithm requires that the difference between msbs is less
+                // than `n_h`, so the comparison is `<=` here.
+                if div_extra <= duo_extra {
+                    // Undersubtracting long division step
+                    let quo_part = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD;
+                    let extra_shl = duo_extra - div_extra;
+
+                    // Addition to the quotient.
+                    quo += (quo_part << extra_shl);
+
+                    // Subtraction from `duo`. At least `n_h - 1` bits are cleared from `duo` here.
+                    duo -= (div.wrapping_mul(quo_part) << extra_shl);
+                } else {
+                    // Two possibility algorithm
+                    let shift = n - duo_lz;
+                    let duo_sig_n = (duo >> shift) as $uX;
+                    let div_sig_n = (div >> shift) as $uX;
+                    let quo_part = $half_division(duo_sig_n, div_sig_n).0;
+                    let div_lo = div as $uX;
+                    let div_hi = (div >> n) as $uX;
+
+                    let (tmp_lo, carry) = carrying_mul(quo_part, div_lo);
+                    // The undersubtracting long division algorithm has already run once, so
+                    // overflow beyond `$uD` bits is not possible here
+                    let (tmp_hi, _) = carrying_mul_add(quo_part, div_hi, carry);
+                    let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+
+                    if duo < tmp {
+                        return (
+                            quo + ((quo_part - 1) as $uD),
+                            duo.wrapping_add(div).wrapping_sub(tmp)
+                        )
+                    } else {
+                        return (
+                            quo + (quo_part as $uD),
+                            duo - tmp
+                        )
+                    }
+                }
+
+                duo_lz = duo.leading_zeros();
+
+                if div_lz <= duo_lz {
+                    // quotient can have 0 or 1 added to it
+                    if div <= duo {
+                        return (
+                            quo + 1,
+                            duo - div
+                        )
+                    } else {
+                        return (
+                            quo,
+                            duo
+                        )
+                    }
+                }
+
+                // This can only happen if `div_sb < n` (because of previous "quo = 0 or 1"
+                // branches), but it is not worth it to unroll further.
+                if n <= duo_lz {
+                    // simple division and addition
+                    let tmp = $half_division(duo as $uX, div as $uX);
+                    return (
+                        quo + (tmp.0 as $uD),
+                        tmp.1 as $uD
+                    )
+                }
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
+            match (duo < 0, div < 0) {
+                (false, false) => {
+                    let t = $unsigned_name(duo as $uD, div as $uD);
+                    (t.0 as $iD, t.1 as $iD)
+                },
+                (true, false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
+                    ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
+                },
+                (false, true) => {
+                    let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(), t.1 as $iD)
+                },
+                (true, true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
+                    (t.0 as $iD, (t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}

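The base-10 walkthrough in the comments above can be checked mechanically. The sketch below
redoes the 76543210 / 213 example with four-digit "sig_n" chunks and a two-digit divisor part
incremented by one, so every partial quotient undersubtracts. The loop condition and the
digit-counting shortcut stand in for the branch structure of the real algorithm; everything here
is illustrative rather than part of the patch:

fn main() {
    let (mut duo, div) = (76_543_210u64, 213u64);
    let mut quo = 0u64;
    // leading two digits of `div`, plus one: 21 + 1 == 22
    let div_sig_n_h_add1 = 22u64;
    // digits of `div` beyond its leading two digits
    let div_extra = 1u32;
    // undersubtracting steps, with shifts measured in decimal places instead of bits
    while duo >= 10_000 {
        // digits of `duo` beyond its leading four digits
        let duo_extra = duo.ilog10() + 1 - 4;
        let duo_sig_n = duo / 10u64.pow(duo_extra);
        let quo_part = duo_sig_n / div_sig_n_h_add1;
        let extra_shl = duo_extra - div_extra;
        quo += quo_part * 10u64.pow(extra_shl);
        duo -= div * quo_part * 10u64.pow(extra_shl);
    }
    // final ordinary division step once `duo` fits in the "half" size
    quo += duo / div;
    duo %= div;
    assert_eq!((quo, duo), (359_357, 169));
    assert_eq!((76_543_210u64 / 213, 76_543_210u64 % 213), (359_357, 169));
}
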
+ 26 - 226
src/int/udiv.rs

@@ -1,270 +1,70 @@
-use int::{Int, LargeInt};
-
-macro_rules! udivmod_inner {
-    ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{
-        let (n, d, rem) = ($n, $d, $rem);
-        // NOTE X is unknown, K != 0
-        if n.high() == 0 {
-            if d.high() == 0 {
-                // 0 X
-                // ---
-                // 0 X
-
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from(n.low().aborting_rem(d.low()));
-                }
-                return <$ty>::from(n.low().aborting_div(d.low()))
-            } else {
-                // 0 X
-                // ---
-                // K X
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            };
-        }
-
-        let mut sr;
-        let mut q;
-        let mut r;
-
-        if d.low() == 0 {
-            if d.high() == 0 {
-                // K X
-                // ---
-                // 0 0
-                // NOTE This should be unreachable in safe Rust because the program will panic before
-                // this intrinsic is called
-                ::abort();
-            }
-
-            if n.low() == 0 {
-                // K 0
-                // ---
-                // K 0
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high()));
-                }
-                return <$ty>::from(n.high().aborting_div(d.high()))
-            }
-
-            // K K
-            // ---
-            // K 0
-
-            if d.high().is_power_of_two() {
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1));
-                }
-                return <$ty>::from(n.high() >> d.high().trailing_zeros());
-            }
-
-            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-            // D > N
-            if sr > <hty!($ty)>::BITS - 2 {
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            }
-
-            sr += 1;
-
-            // 1 <= sr <= <hty!($ty)>::BITS - 1
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
-        } else if d.high() == 0 {
-            // K X
-            // ---
-            // 0 K
-            if d.low().is_power_of_two() {
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from(n.low() & (d.low() - 1));
-                }
-
-                if d.low() == 1 {
-                    return n;
-                } else {
-                    let sr = d.low().trailing_zeros();
-                    return n >> sr;
-                };
-            }
-
-            sr = 1 + <hty!($ty)>::BITS + d.low().leading_zeros() - n.high().leading_zeros();
-
-            // 2 <= sr <= u64::BITS - 1
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
-        } else {
-            // K X
-            // ---
-            // K K
-            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-            // D > N
-            if sr > <hty!($ty)>::BITS - 1 {
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            }
-
-            sr += 1;
-
-            // 1 <= sr <= <hty!($ty)>::BITS
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
-        }
-
-        // Not a special case
-        // q and r are initialized with
-        // q = n << (u64::BITS - sr)
-        // r = n >> sr
-        // 1 <= sr <= u64::BITS - 1
-        let mut carry = 0;
-
-        // Don't use a range because they may generate references to memcpy in unoptimized code
-        let mut i = 0;
-        while i < sr {
-            i += 1;
-
-            // r:q = ((r:q) << 1) | carry
-            r = (r << 1) | (q >> (<$ty>::BITS - 1));
-            q = (q << 1) | carry as $ty;
-
-            // carry = 0
-            // if r >= d {
-            //     r -= d;
-            //     carry = 1;
-            // }
-            let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1);
-            carry = (s & 1) as hty!($ty);
-            r -= d & s as $ty;
-        }
-
-        if let Some(rem) = rem {
-            *rem = r;
-        }
-        (q << 1) | carry as $ty
-    }}
-}
+use int::specialized_div_rem::*;
 
 intrinsics! {
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_uidiv]
     /// Returns `n / d`
     pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
-        // Special cases
-        if d == 0 {
-            // NOTE This should be unreachable in safe Rust because the program will panic before
-            // this intrinsic is called
-            ::abort();
-        }
-
-        if n == 0 {
-            return 0;
-        }
-
-        let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros());
-
-        // d > n
-        if sr > u32::BITS - 1 {
-            return 0;
-        }
-
-        // d == 1
-        if sr == u32::BITS - 1 {
-            return n;
-        }
-
-        sr += 1;
-
-        // 1 <= sr <= u32::BITS - 1
-        let mut q = n << (u32::BITS - sr);
-        let mut r = n >> sr;
-
-        let mut carry = 0;
-
-        // Don't use a range because they may generate references to memcpy in unoptimized code
-        let mut i = 0;
-        while i < sr {
-            i += 1;
-
-            // r:q = ((r:q) << 1) | carry
-            r = (r << 1) | (q >> (u32::BITS - 1));
-            q = (q << 1) | carry;
-
-            // carry = 0;
-            // if r > d {
-            //     r -= d;
-            //     carry = 1;
-            // }
-
-            let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1);
-            carry = (s & 1) as u32;
-            r -= d & s as u32;
-        }
-
-        (q << 1) | carry
+        u32_div_rem(n, d).0
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n % d`
     pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
-        let q = __udivsi3(n, d);
-        n - q * d
+        u32_div_rem(n, d).1
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n / d` and sets `*rem = n % d`
     pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
-        let q = __udivsi3(n, d);
+        let quo_rem = u32_div_rem(n, d);
         if let Some(rem) = rem {
-            *rem = n - (q * d);
+            *rem = quo_rem.1;
         }
-        q
+        quo_rem.0
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n / d`
     pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
-        __udivmoddi4(n, d, None)
+        u64_div_rem(n, d).0
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n % d`
     pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 {
-        let mut rem = 0;
-        __udivmoddi4(n, d, Some(&mut rem));
-        rem
+        u64_div_rem(n, d).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+        let quo_rem = u64_div_rem(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d`
     pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
-        __udivmodti4(n, d, None)
+        u128_div_rem(n, d).0
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n % d`
     pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
-        let mut rem = 0;
-        __udivmodti4(n, d, Some(&mut rem));
-        rem
-    }
-
-    /// Returns `n / d` and sets `*rem = n % d`
-    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
-        udivmod_inner!(n, d, rem, u64)
+        u128_div_rem(n, d).1
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d` and sets `*rem = n % d`
-    pub extern "C" fn __udivmodti4(n: u128,
-                                   d: u128,
-                                   rem: Option<&mut u128>) -> u128 {
-        udivmod_inner!(n, d, rem, u128)
+    pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+        let quo_rem = u128_div_rem(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
     }
 }

+ 1 - 0
src/lib.rs

@@ -1,4 +1,5 @@
 #![cfg_attr(feature = "compiler-builtins", compiler_builtins)]
+#![cfg_attr(feature = "asm", feature(asm))]
 #![feature(abi_unadjusted)]
 #![feature(llvm_asm)]
 #![feature(global_asm)]

+ 2 - 1
testcrate/Cargo.toml

@@ -28,7 +28,8 @@ utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japa
 utest-macros = { git = "https://github.com/japaric/utest" }
 
 [features]
+default = ["asm", "mangled-names"]
+asm = ["compiler_builtins/asm"]
 c = ["compiler_builtins/c"]
 mem = ["compiler_builtins/mem"]
 mangled-names = ["compiler_builtins/mangled-names"]
-default = ["mangled-names"]

+ 13 - 0
testcrate/build.rs

@@ -805,6 +805,19 @@ fn main() {
             (builtins::int::sdiv::__divmodsi4(a, b, &mut r), r)
         }",
     );
+    gen(
+        |(a, b): (MyI128, MyI128)| {
+            if b.0 == 0 {
+                None
+            } else {
+                Some((a.0 / b.0, a.0 % b.0))
+            }
+        },
+        "{
+            let mut r = 0;
+            (builtins::int::sdiv::__divmodti4(a, b, &mut r), r)
+        }",
+    );
     gen(
         |(a, b): (MyI32, MyI32)| {
             if b.0 == 0 {

+ 137 - 0
testcrate/tests/div_rem.rs

@@ -0,0 +1,137 @@
+use rand_xoshiro::rand_core::{RngCore, SeedableRng};
+use rand_xoshiro::Xoshiro128StarStar;
+
+use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
+use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4};
+
+/// Creates intensive test functions for division functions of a certain size
+macro_rules! test {
+    (
+        $n:expr, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer that will be shifted
+        $iX:ident, // signed version of $uX
+        $test_name:ident, // name of the test function
+        $unsigned_name:ident, // unsigned division function
+        $signed_name:ident // signed division function
+    ) => {
+        #[test]
+        fn $test_name() {
+            fn assert_invariants(lhs: $uX, rhs: $uX) {
+                let rem: &mut $uX = &mut 0;
+                let quo: $uX = $unsigned_name(lhs, rhs, Some(rem));
+                let rem = *rem;
+                if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+                    panic!(
+                        "unsigned division function failed with lhs:{} rhs:{} \
+                        expected:({}, {}) found:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+
+                // test the signed division function also
+                let lhs = lhs as $iX;
+                let rhs = rhs as $iX;
+                let mut rem: $iX = 0;
+                let quo: $iX = $signed_name(lhs, rhs, &mut rem);
+                // We cannot just test that
+                // `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)`, but also
+                // need to make sure the remainder isn't larger than the divisor
+                // and has the correct sign.
+                let incorrect_rem = if rem == 0 {
+                    false
+                } else if rhs == $iX::MIN {
+                    // `rhs.wrapping_abs()` would overflow, so handle this case
+                    // separately.
+                    (lhs.is_negative() != rem.is_negative()) || (rem == $iX::MIN)
+                } else {
+                    (lhs.is_negative() != rem.is_negative())
+                        || (rhs.wrapping_abs() <= rem.wrapping_abs())
+                };
+                if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) {
+                    panic!(
+                        "signed division function failed with lhs:{} rhs:{} \
+                        expected:({}, {}) found:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+            }
+
+            // Specially designed random fuzzer
+            let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+            let mut lhs: $uX = 0;
+            let mut rhs: $uX = 0;
+            // all ones constant
+            let ones: $uX = !0;
+            // Alternating ones and zeros (e.g. 0b1010101010101010). This catches second-order
+            // problems that might occur for algorithms with two modes of operation (potentially
+            // there is some invariant that can be broken for large `duo` and maintained via
+            // alternating between modes, breaking the algorithm when it reaches the end).
+            let mut alt_ones: $uX = 1;
+            for _ in 0..($n / 2) {
+                alt_ones <<= 2;
+                alt_ones |= 1;
+            }
+            // creates a mask for indexing the bits of the type
+            let bit_indexing_mask = $n - 1;
+            for _ in 0..1_000_000 {
+                // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of
+                // ones with `lhs` and `rhs`. This results in excellent fuzzing entropy such as:
+                // lhs:10101010111101000000000100101010 rhs: 1010101010000000000000001000001
+                // lhs:10101010111101000000000101001010 rhs: 1010101010101010101010100010100
+                // lhs:10101010111101000000000101001010 rhs:11101010110101010101010100001110
+                // lhs:10101010000000000000000001001010 rhs:10100010100000000000000000001010
+                // lhs:10101010000000000000000001001010 rhs:            10101010101010101000
+                // lhs:10101010000000000000000001100000 rhs:11111111111101010101010101001111
+                // lhs:10101010000000101010101011000000 rhs:11111111111101010101010100000111
+                // lhs:10101010101010101010101011101010 rhs:             1010100000000000000
+                // lhs:11111111110101101010101011010111 rhs:             1010100000000000000
+                // The msb is set half of the time by the fuzzer, but `assert_invariants` tests
+                // both the signed and unsigned functions.
+                let r0: u32 = bit_indexing_mask & rng.next_u32();
+                let r1: u32 = bit_indexing_mask & rng.next_u32();
+                let mask = ones.wrapping_shr(r0).rotate_left(r1);
+                match rng.next_u32() % 8 {
+                    0 => lhs |= mask,
+                    1 => lhs &= mask,
+                    // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise
+                    // the entropy gets destroyed too often
+                    2 | 3 => lhs ^= mask,
+                    4 => rhs |= mask,
+                    5 => rhs &= mask,
+                    _ => rhs ^= mask,
+                }
+                // do the same for alternating ones and zeros
+                let r0: u32 = bit_indexing_mask & rng.next_u32();
+                let r1: u32 = bit_indexing_mask & rng.next_u32();
+                let mask = alt_ones.wrapping_shr(r0).rotate_left(r1);
+                match rng.next_u32() % 8 {
+                    0 => lhs |= mask,
+                    1 => lhs &= mask,
+                    // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise
+                    // the entropy gets destroyed too often
+                    2 | 3 => lhs ^= mask,
+                    4 => rhs |= mask,
+                    5 => rhs &= mask,
+                    _ => rhs ^= mask,
+                }
+                if rhs != 0 {
+                    assert_invariants(lhs, rhs);
+                }
+            }
+        }
+    };
+}
+
+test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4);
+test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4);
+test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4);
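
For readability, the masks this fuzzer constructs can be illustrated on u8: `ones.wrapping_shr(r0)`
leaves a contiguous run of ones at the bottom, `rotate_left(r1)` relocates (and possibly wraps)
that run, and the alternating pattern is built exactly as in the test macro. This sketch is
illustrative only:

fn main() {
    let ones: u8 = !0;
    // a run of 8 - 3 == 5 ones at the bottom
    assert_eq!(ones.wrapping_shr(3), 0b0001_1111);
    // the same run rotated into place; OR/AND/XOR with it flips a randomly sized,
    // randomly placed field of bits in `lhs` or `rhs`
    assert_eq!(ones.wrapping_shr(3).rotate_left(2), 0b0111_1100);

    // alternating ones and zeros, built the same way as in the test macro
    let mut alt_ones: u8 = 1;
    for _ in 0..(8 / 2) {
        alt_ones <<= 2;
        alt_ones |= 1;
    }
    assert_eq!(alt_ones, 0b0101_0101);
}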