delegate.rs
/// Creates an unsigned division function that uses a combination of hardware division and
/// binary long division to divide integers larger than what hardware division by itself can
/// do. This function is intended for microarchitectures that have division hardware, but not
/// fast enough multiplication hardware for `impl_trifecta` to be faster.
#[allow(unused_macros)]
macro_rules! impl_delegate {
    (
        $fn:ident, // name of the unsigned division function
        $zero_div_fn:ident, // function called when division by zero is attempted
        $half_normalization_shift:ident, // function for finding the normalization shift of $uX
        $half_division:ident, // function for division of a $uX by a $uX
        $n_h:expr, // the number of bits in $iH or $uH
        $uH:ident, // unsigned integer with half the bit width of $uX
        $uX:ident, // unsigned integer with half the bit width of $uD
        $uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
        $iD:ident // signed integer type with the same bitwidth as `$uD`
    ) => {
        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
        /// tuple.
        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
            // The two possibility algorithm, undersubtracting long division algorithm, or any
            // kind of reciprocal based algorithm will not be fastest, because they involve large
            // multiplications that we assume to not be fast enough relative to the divisions to
            // outweigh setup times.

            // the number of bits in a $uX
            let n = $n_h * 2;

            let duo_lo = duo as $uX;
            let duo_hi = (duo >> n) as $uX;
            let div_lo = div as $uX;
            let div_hi = (div >> n) as $uX;

            match (div_lo == 0, div_hi == 0, duo_hi == 0) {
                (true, true, _) => $zero_div_fn(),
                (_, false, true) => {
                    // `duo` < `div`
                    return (0, duo);
                }
                (false, true, true) => {
                    // delegate to smaller division
                    let tmp = $half_division(duo_lo, div_lo);
                    return (tmp.0 as $uD, tmp.1 as $uD);
                }
                (false, true, false) => {
                    if duo_hi < div_lo {
                        // `quo_hi` will always be 0. This performs a binary long division
                        // algorithm to zero `duo_hi`, followed by a half division.

                        // We can calculate the normalization shift using only `$uX` size
                        // functions. If we calculated the normalization shift using
                        // `$half_normalization_shift(duo_hi, div_lo, false)`, it would break the
                        // function's assumption that its first argument is larger than its
                        // second. If the arguments are switched, the assumption holds, since
                        // `duo_hi < div_lo`.
                        let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
                        let shl = if norm_shift == 0 {
                            // Consider what happens if the msbs of `duo_hi` and `div_lo` align
                            // with no shifting. The normalization shift will always return
                            // `norm_shift == 0` regardless of whether it is fully normalized,
                            // because `duo_hi < div_lo`. In that edge case, `n - norm_shift`
                            // would result in shift overflow down the line. For the edge case,
                            // because both `duo_hi < div_lo` and we are comparing all the
                            // significant bits of `duo_hi` and `div`, we can make `shl = n - 1`.
                            n - 1
                        } else {
                            // We also cannot just use `shl = n - norm_shift - 1` in the general
                            // case, because when we are not in the edge case comparing all the
                            // significant bits, the full `duo < div` may not be true, which
                            // would break the division algorithm.
                            n - norm_shift
                        };
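
                        // For example, with `$uX = u32` (so `n == 32`), `duo_hi == 0x4000_0000`,
                        // and `div_lo == 0x4000_0001`: the msbs align, so `norm_shift == 0` even
                        // though `div` is not fully normalized against `duo`, and
                        // `n - norm_shift == 32` would overflow `pow_lo: u32` below. `shl == 31`
                        // is safe, since `duo_hi < div_lo` already guarantees `duo < div << n`.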

                        // The 3 variable restoring division algorithm (see binary_long.rs) is
                        // ideal for this task, since `pow` and `quo` can be `$uX` and the
                        // delegation check is simple.
                        let mut div: $uD = div << shl;
                        let mut pow_lo: $uX = 1 << shl;
                        let mut quo_lo: $uX = 0;
                        let mut duo = duo;
                        loop {
                            let sub = duo.wrapping_sub(div);
                            if 0 <= (sub as $iD) {
                                duo = sub;
                                quo_lo |= pow_lo;
                                let duo_hi = (duo >> n) as $uX;
                                if duo_hi == 0 {
                                    // Delegate to get the rest of the quotient. Note that the
                                    // `div_lo` here is the original unshifted `div`.
                                    let tmp = $half_division(duo as $uX, div_lo);
                                    return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
                                }
                            }
                            div >>= 1;
                            pow_lo >>= 1;
                        }
                    } else if duo_hi == div_lo {
                        // `quo_hi == 1`. This branch is cheap and helps with edge cases.
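                        // Since `div_hi == 0` and `duo_hi == div_lo`, we have
                        // `duo == (div << n) | duo_lo`, so the quotient is exactly
                        // `(1 << n) | (duo_lo / div)` and the remainder is `duo_lo % div`.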
                        let tmp = $half_division(duo as $uX, div as $uX);
                        return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
                    } else {
                        // `div_lo < duo_hi`
                        // `rem_hi == 0`
                        if (div_lo >> $n_h) == 0 {
                            // Short division of $uD by a $uH, using $uX by $uX division
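                            // This is schoolbook short division in base `2^$n_h`: the two highest
                            // "digits" are divided at once, then each remainder is carried into
                            // the next digit down. Every intermediate numerator fits in a `$uX`,
                            // because each remainder is less than `div_0 < 1 << $n_h`.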
                            let div_0 = div_lo as $uH as $uX;
                            let (quo_hi, rem_3) = $half_division(duo_hi, div_0);

                            let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
                            let (quo_1, rem_2) = $half_division(duo_mid, div_0);

                            let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
                            let (quo_0, rem_1) = $half_division(duo_lo, div_0);

                            return (
                                (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
                                rem_1 as $uD,
                            );
                        }

                        // This is basically a short division composed of a half division for the
                        // hi part, specialized 3 variable binary long division in the middle, and
                        // another half division for the lo part.
                        let duo_lo = duo as $uX;
                        let tmp = $half_division(duo_hi, div_lo);
                        let quo_hi = tmp.0;
                        let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
                        // This check is required to avoid breaking the long division below.
                        if duo < div {
                            return ((quo_hi as $uD) << n, duo);
                        }

                        // The half division handled all shift alignments down to `n`, so this
                        // division can continue with a shift of `n - 1`.
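                        // (`duo < div << n` holds at this point because `tmp.1 < div_lo`, so the
                        // quotient bit at position `n - 1` is the highest that can be set.)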
                        let mut div: $uD = div << (n - 1);
                        let mut pow_lo: $uX = 1 << (n - 1);
                        let mut quo_lo: $uX = 0;
                        loop {
                            let sub = duo.wrapping_sub(div);
                            if 0 <= (sub as $iD) {
                                duo = sub;
                                quo_lo |= pow_lo;
                                let duo_hi = (duo >> n) as $uX;
                                if duo_hi == 0 {
                                    // Delegate to get the rest of the quotient. Note that the
                                    // `div_lo` here is the original unshifted `div`.
                                    let tmp = $half_division(duo as $uX, div_lo);
                                    return (
                                        (tmp.0 as $uD) | (quo_lo as $uD) | ((quo_hi as $uD) << n),
                                        tmp.1 as $uD,
                                    );
                                }
                            }
                            div >>= 1;
                            pow_lo >>= 1;
                        }
                    }
                }
                (_, false, false) => {
                    // Full $uD by $uD binary long division. `quo_hi` will always be 0.
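                    // (`div_hi != 0` implies `div >= 1 << n`, so the quotient fits in a `$uX`.)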
                    if duo < div {
                        return (0, duo);
                    }
                    let div_original = div;
                    let shl = $half_normalization_shift(duo_hi, div_hi, false);
                    let mut duo = duo;
                    let mut div: $uD = div << shl;
                    let mut pow_lo: $uX = 1 << shl;
                    let mut quo_lo: $uX = 0;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as $iD) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            if duo < div_original {
                                return (quo_lo as $uD, duo);
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                }
            }
        }
    };
}
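
// As a sketch, the macro might be instantiated along these lines (`zero_div_fn`,
// `u32_normalization_shift`, and `u32_by_u32_div_rem` are assumed to be helpers from the
// surrounding module, and the generated function name here is hypothetical):
//
// impl_delegate!(
//     u64_div_rem_delegate,
//     zero_div_fn,
//     u32_normalization_shift,
//     u32_by_u32_div_rem,
//     16, // the number of bits in a u16
//     u16,
//     u32,
//     u64,
//     i64
// );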

public_test_dep! {
    /// Returns `n / d` and sets `*rem = n % d`.
    ///
    /// This specialization exists because:
    ///  - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
    ///    so we have to use an old fashioned `&mut u128` argument to return the remainder.
    ///  - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
    ///    delegate algorithm strategy the only reasonably fast way to perform `u128` division.
    // used on SPARC
    #[allow(dead_code)]
    pub(crate) fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
        use super::*;
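
        // This is the `impl_delegate` algorithm above, manually specialized for `u64` halves
        // (`n == 64`); see the macro body for the fully commented version of each branch.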
        let duo_lo = duo as u64;
        let duo_hi = (duo >> 64) as u64;
        let div_lo = div as u64;
        let div_hi = (div >> 64) as u64;

        match (div_lo == 0, div_hi == 0, duo_hi == 0) {
            (true, true, _) => zero_div_fn(),
            (_, false, true) => {
                *rem = duo;
                return 0;
            }
            (false, true, true) => {
                let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
                *rem = tmp.1 as u128;
                return tmp.0 as u128;
            }
            (false, true, false) => {
                if duo_hi < div_lo {
                    let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
                    let shl = if norm_shift == 0 {
                        64 - 1
                    } else {
                        64 - norm_shift
                    };

                    let mut div: u128 = div << shl;
                    let mut pow_lo: u64 = 1 << shl;
                    let mut quo_lo: u64 = 0;
                    let mut duo = duo;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as i128) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            let duo_hi = (duo >> 64) as u64;
                            if duo_hi == 0 {
                                let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                                *rem = tmp.1 as u128;
                                return (quo_lo | tmp.0) as u128;
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                } else if duo_hi == div_lo {
                    let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
                    *rem = tmp.1 as u128;
                    return (1 << 64) | (tmp.0 as u128);
                } else {
                    if (div_lo >> 32) == 0 {
                        let div_0 = div_lo as u32 as u64;
                        let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);

                        let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
                        let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);

                        let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
                        let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);

                        *rem = rem_1 as u128;
                        return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
                    }

                    let duo_lo = duo as u64;
                    let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
                    let quo_hi = tmp.0;
                    let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
                    if duo < div {
                        *rem = duo;
                        return (quo_hi as u128) << 64;
                    }

                    let mut div: u128 = div << (64 - 1);
                    let mut pow_lo: u64 = 1 << (64 - 1);
                    let mut quo_lo: u64 = 0;
                    loop {
                        let sub = duo.wrapping_sub(div);
                        if 0 <= (sub as i128) {
                            duo = sub;
                            quo_lo |= pow_lo;
                            let duo_hi = (duo >> 64) as u64;
                            if duo_hi == 0 {
                                let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
                                *rem = tmp.1 as u128;
                                return (tmp.0 as u128) | (quo_lo as u128) | ((quo_hi as u128) << 64);
                            }
                        }
                        div >>= 1;
                        pow_lo >>= 1;
                    }
                }
            }
            (_, false, false) => {
                if duo < div {
                    *rem = duo;
                    return 0;
                }
                let div_original = div;
                let shl = u64_normalization_shift(duo_hi, div_hi, false);
                let mut duo = duo;
                let mut div: u128 = div << shl;
                let mut pow_lo: u64 = 1 << shl;
                let mut quo_lo: u64 = 0;
                loop {
                    let sub = duo.wrapping_sub(div);
                    if 0 <= (sub as i128) {
                        duo = sub;
                        quo_lo |= pow_lo;
                        if duo < div_original {
                            *rem = duo;
                            return quo_lo as u128;
                        }
                    }
                    div >>= 1;
                    pow_lo >>= 1;
                }
            }
        }
    }
}
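
// A minimal usage sketch (values chosen for illustration only):
//
// let mut rem = 0u128;
// let quo = u128_divide_sparc(u128::MAX, 3, &mut rem);
// assert_eq!(quo * 3 + rem, u128::MAX);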