// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s make it clear where function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]

//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//! - Many 128 bit division functions like `u128::wrapping_div` use
//!   `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!   is not a 128 bit by 128 bit hardware division function in most architectures.
//!   `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!   `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!   one function to calculate both the quotient and remainder). If configuration flags
//!   enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
//!   which requires the half sized division `u64_by_u64_div_rem`. If the architecture
//!   supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
//!   reduced to those instructions. Note that we do not specify the half size division
//!   directly to be `__udivdi3`, because hardware division would never be introduced.
//! - If the architecture does not supply a 64 bit hardware division instruction, `u64`
//!   divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
//!   which is defined by `impl_delegate!`. The half division for this algorithm is
//!   `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
//!   software division algorithms.
//! - If the architecture does not supply a 32 bit hardware division instruction, linkers
//!   will look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no
//!   half division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
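//!
//! As a minimal sketch of why one `u128_div_rem` can serve several intrinsics
//! (the real wrappers live elsewhere in `compiler-builtins`; these two are
//! illustrative only):
//!
//! ```ignore
//! pub extern "C" fn __udivti3(a: u128, b: u128) -> u128 {
//!     u128_div_rem(a, b).0 // quotient
//! }
//! pub extern "C" fn __umodti3(a: u128, b: u128) -> u128 {
//!     u128_div_rem(a, b).1 // remainder
//! }
//! ```
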
// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex

#[macro_use]
mod norm_shift;

#[macro_use]
mod binary_long;

#[macro_use]
mod delegate;

// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "public-test-deps"))]
pub(crate) use self::delegate::u128_divide_sparc;
#[cfg(feature = "public-test-deps")]
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;

#[macro_use]
mod asymmetric;

/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    unsafe { core::hint::unreachable_unchecked() }
}

const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8m.base` and `thumbv8m.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};

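// When `USE_LZ` is true, the normalization shifts generated below are based on
// `leading_zeros`. A hedged sketch of the idea with a hypothetical standalone
// helper (the macro-generated functions are more involved); assumes a host
// test harness is available:
#[cfg(test)]
mod normalization_shift_sketch {
    // hypothetical helper: the left shift that aligns the most significant
    // set bit of `div` with that of `duo` (assumes `div != 0` and `duo >= div`)
    fn norm_shift_sketch(duo: u32, div: u32) -> u32 {
        div.leading_zeros() - duo.leading_zeros()
    }

    #[test]
    fn aligns_msbs() {
        // shifting 0b10 left by 6 aligns its MSB with that of 0b1000_0000
        assert_eq!(norm_shift_sketch(0b1000_0000, 0b10), 6);
    }
}
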
impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

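// A small sanity check of the contract above: one call yields both halves of
// the division. A hedged sketch, assuming a host test harness is available
// (this crate is normally built as `no_std`):
#[cfg(test)]
mod u64_by_u64_div_rem_sketch {
    #[test]
    fn quotient_and_remainder() {
        assert_eq!(super::u64_by_u64_div_rem(10, 3), (3, 1));
        assert_eq!(super::u64_by_u64_div_rem(u64::MAX, 1), (u64::MAX, 0));
    }
}
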
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

// If the pointer width is less than 64, then the target architecture almost certainly does not
// have the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

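// A hedged illustration of the safety condition above: the quotient fits in a
// `u64` exactly when the high 64 bits of `duo` are less than `div`. Gated the
// same way as the function it exercises, and assumes a host test harness:
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64", test))]
mod u128_by_u64_div_rem_sketch {
    #[test]
    fn in_range_division() {
        let duo: u128 = (1 << 64) + 5; // high half is 1
        let div: u64 = 2; // 1 < 2, so the quotient fits in a `u64`
        let (quo, rem) = unsafe { super::u128_by_u64_div_rem(duo, div) };
        assert_eq!((quo as u128, rem as u128), (duo / 2, duo % 2));
    }
}
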
// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// When not on x86 and the pointer width is not 64, use `delegate` since the division size is
// larger than the register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);

// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in eax and the remainder in edx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("eax") duo_lo => quo,
            inlateout("edx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);

// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32
);