memchr.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
  2. // file at the top-level directory of this distribution and at
  3. // http://rust-lang.org/COPYRIGHT.
  4. //
  5. // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  6. // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  7. // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  8. // option. This file may not be copied, modified, or distributed
  9. // except according to those terms.
  10. //
  11. // Original implementation taken from rust-memchr
  12. // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch
  13. /// A safe interface to `memchr`.
  14. ///
  15. /// Returns the index corresponding to the first occurrence of `needle` in
  16. /// `haystack`, or `None` if one is not found.
  17. ///
  18. /// memchr reduces to super-optimized machine code at around an order of
  19. /// magnitude faster than `haystack.iter().position(|&b| b == needle)`.
  20. /// (See benchmarks.)
  21. ///
  22. /// # Example
  23. ///
  24. /// This shows how to find the first position of a byte in a byte string.
  25. ///
  26. /// ```rust,ignore
  27. /// use memchr::memchr;
  28. ///
  29. /// let haystack = b"the quick brown fox";
  30. /// assert_eq!(memchr(b'k', haystack), Some(8));
  31. /// ```
  32. pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
  33. // libc memchr
  34. #[cfg(not(target_os = "windows"))]
  35. fn memchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
  36. use libc;
  37. let p = unsafe {
  38. libc::memchr(
  39. haystack.as_ptr() as *const libc::c_void,
  40. needle as libc::c_int,
  41. haystack.len() as libc::size_t)
  42. };
  43. if p.is_null() {
  44. None
  45. } else {
  46. Some(p as usize - (haystack.as_ptr() as usize))
  47. }
  48. }
  49. // use fallback on windows, since it's faster
  50. #[cfg(target_os = "windows")]
  51. fn memchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
  52. fallback::memchr(needle, haystack)
  53. }
  54. memchr_specific(needle, haystack)
  55. }
  56. /// A safe interface to `memrchr`.
  57. ///
  58. /// Returns the index corresponding to the last occurrence of `needle` in
  59. /// `haystack`, or `None` if one is not found.
  60. ///
  61. /// # Example
  62. ///
  63. /// This shows how to find the last position of a byte in a byte string.
  64. ///
  65. /// ```rust,ignore
  66. /// use memchr::memrchr;
  67. ///
  68. /// let haystack = b"the quick brown fox";
  69. /// assert_eq!(memrchr(b'o', haystack), Some(17));
  70. /// ```
  71. pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
  72. #[cfg(target_os = "linux")]
  73. fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
  74. use libc;
  75. // GNU's memrchr() will - unlike memchr() - error if haystack is empty.
  76. if haystack.is_empty() {return None}
  77. let p = unsafe {
  78. libc::memrchr(
  79. haystack.as_ptr() as *const libc::c_void,
  80. needle as libc::c_int,
  81. haystack.len() as libc::size_t)
  82. };
  83. if p.is_null() {
  84. None
  85. } else {
  86. Some(p as usize - (haystack.as_ptr() as usize))
  87. }
  88. }
  89. #[cfg(not(target_os = "linux"))]
  90. fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
  91. fallback::memrchr(needle, haystack)
  92. }
  93. memrchr_specific(needle, haystack)
  94. }
  95. #[allow(dead_code)]
  96. mod fallback {
  97. use cmp;
  98. use mem;
  99. const LO_U64: u64 = 0x0101010101010101;
  100. const HI_U64: u64 = 0x8080808080808080;
  101. // use truncation
  102. const LO_USIZE: usize = LO_U64 as usize;
  103. const HI_USIZE: usize = HI_U64 as usize;
  104. /// Return `true` if `x` contains any zero byte.
  105. ///
  106. /// From *Matters Computational*, J. Arndt
  107. ///
  108. /// "The idea is to subtract one from each of the bytes and then look for
  109. /// bytes where the borrow propagated all the way to the most significant
  110. /// bit."
  111. #[inline]
  112. fn contains_zero_byte(x: usize) -> bool {
  113. x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0
  114. }
  115. #[cfg(target_pointer_width = "32")]
  116. #[inline]
  117. fn repeat_byte(b: u8) -> usize {
  118. let mut rep = (b as usize) << 8 | b as usize;
  119. rep = rep << 16 | rep;
  120. rep
  121. }
  122. #[cfg(target_pointer_width = "64")]
  123. #[inline]
  124. fn repeat_byte(b: u8) -> usize {
  125. let mut rep = (b as usize) << 8 | b as usize;
  126. rep = rep << 16 | rep;
  127. rep = rep << 32 | rep;
  128. rep
  129. }
  130. /// Return the first index matching the byte `a` in `text`.
  131. pub fn memchr(x: u8, text: &[u8]) -> Option<usize> {
  132. // Scan for a single byte value by reading two `usize` words at a time.
  133. //
  134. // Split `text` in three parts
  135. // - unaligned initial part, before the first word aligned address in text
  136. // - body, scan by 2 words at a time
  137. // - the last remaining part, < 2 word size
  138. let len = text.len();
  139. let ptr = text.as_ptr();
  140. let usize_bytes = mem::size_of::<usize>();
  141. // search up to an aligned boundary
  142. let align = (ptr as usize) & (usize_bytes- 1);
  143. let mut offset;
  144. if align > 0 {
  145. offset = cmp::min(usize_bytes - align, len);
  146. if let Some(index) = text[..offset].iter().position(|elt| *elt == x) {
  147. return Some(index);
  148. }
  149. } else {
  150. offset = 0;
  151. }
  152. // search the body of the text
  153. let repeated_x = repeat_byte(x);
  154. if len >= 2 * usize_bytes {
  155. while offset <= len - 2 * usize_bytes {
  156. unsafe {
  157. let u = *(ptr.offset(offset as isize) as *const usize);
  158. let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize);
  159. // break if there is a matching byte
  160. let zu = contains_zero_byte(u ^ repeated_x);
  161. let zv = contains_zero_byte(v ^ repeated_x);
  162. if zu || zv {
  163. break;
  164. }
  165. }
  166. offset += usize_bytes * 2;
  167. }
  168. }
  169. // find the byte after the point the body loop stopped
  170. text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i)
  171. }
  172. /// Return the last index matching the byte `a` in `text`.
  173. pub fn memrchr(x: u8, text: &[u8]) -> Option<usize> {
  174. // Scan for a single byte value by reading two `usize` words at a time.
  175. //
  176. // Split `text` in three parts
  177. // - unaligned tail, after the last word aligned address in text
  178. // - body, scan by 2 words at a time
  179. // - the first remaining bytes, < 2 word size
  180. let len = text.len();
  181. let ptr = text.as_ptr();
  182. let usize_bytes = mem::size_of::<usize>();
  183. // search to an aligned boundary
  184. let end_align = (ptr as usize + len) & (usize_bytes - 1);
  185. let mut offset;
  186. if end_align > 0 {
  187. offset = len - cmp::min(usize_bytes - end_align, len);
  188. if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) {
  189. return Some(offset + index);
  190. }
  191. } else {
  192. offset = len;
  193. }
  194. // search the body of the text
  195. let repeated_x = repeat_byte(x);
  196. while offset >= 2 * usize_bytes {
  197. unsafe {
  198. let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize);
  199. let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize);
  200. // break if there is a matching byte
  201. let zu = contains_zero_byte(u ^ repeated_x);
  202. let zv = contains_zero_byte(v ^ repeated_x);
  203. if zu || zv {
  204. break;
  205. }
  206. }
  207. offset -= 2 * usize_bytes;
  208. }
  209. // find the byte before the point the body loop stopped
  210. text[..offset].iter().rposition(|elt| *elt == x)
  211. }
  212. // test fallback implementations on all plattforms
  213. #[test]
  214. fn matches_one() {
  215. assert_eq!(Some(0), memchr(b'a', b"a"));
  216. }
  217. #[test]
  218. fn matches_begin() {
  219. assert_eq!(Some(0), memchr(b'a', b"aaaa"));
  220. }
  221. #[test]
  222. fn matches_end() {
  223. assert_eq!(Some(4), memchr(b'z', b"aaaaz"));
  224. }
  225. #[test]
  226. fn matches_nul() {
  227. assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00"));
  228. }
  229. #[test]
  230. fn matches_past_nul() {
  231. assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z"));
  232. }
  233. #[test]
  234. fn no_match_empty() {
  235. assert_eq!(None, memchr(b'a', b""));
  236. }
  237. #[test]
  238. fn no_match() {
  239. assert_eq!(None, memchr(b'a', b"xyz"));
  240. }
  241. #[test]
  242. fn matches_one_reversed() {
  243. assert_eq!(Some(0), memrchr(b'a', b"a"));
  244. }
  245. #[test]
  246. fn matches_begin_reversed() {
  247. assert_eq!(Some(3), memrchr(b'a', b"aaaa"));
  248. }
  249. #[test]
  250. fn matches_end_reversed() {
  251. assert_eq!(Some(0), memrchr(b'z', b"zaaaa"));
  252. }
  253. #[test]
  254. fn matches_nul_reversed() {
  255. assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00"));
  256. }
  257. #[test]
  258. fn matches_past_nul_reversed() {
  259. assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa"));
  260. }
  261. #[test]
  262. fn no_match_empty_reversed() {
  263. assert_eq!(None, memrchr(b'a', b""));
  264. }
  265. #[test]
  266. fn no_match_reversed() {
  267. assert_eq!(None, memrchr(b'a', b"xyz"));
  268. }
  269. }
  270. #[cfg(test)]
  271. mod tests {
  272. // test the implementations for the current plattform
  273. use super::{memchr, memrchr};
  274. #[test]
  275. fn matches_one() {
  276. assert_eq!(Some(0), memchr(b'a', b"a"));
  277. }
  278. #[test]
  279. fn matches_begin() {
  280. assert_eq!(Some(0), memchr(b'a', b"aaaa"));
  281. }
  282. #[test]
  283. fn matches_end() {
  284. assert_eq!(Some(4), memchr(b'z', b"aaaaz"));
  285. }
  286. #[test]
  287. fn matches_nul() {
  288. assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00"));
  289. }
  290. #[test]
  291. fn matches_past_nul() {
  292. assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z"));
  293. }
  294. #[test]
  295. fn no_match_empty() {
  296. assert_eq!(None, memchr(b'a', b""));
  297. }
  298. #[test]
  299. fn no_match() {
  300. assert_eq!(None, memchr(b'a', b"xyz"));
  301. }
  302. #[test]
  303. fn matches_one_reversed() {
  304. assert_eq!(Some(0), memrchr(b'a', b"a"));
  305. }
  306. #[test]
  307. fn matches_begin_reversed() {
  308. assert_eq!(Some(3), memrchr(b'a', b"aaaa"));
  309. }
  310. #[test]
  311. fn matches_end_reversed() {
  312. assert_eq!(Some(0), memrchr(b'z', b"zaaaa"));
  313. }
  314. #[test]
  315. fn matches_nul_reversed() {
  316. assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00"));
  317. }
  318. #[test]
  319. fn matches_past_nul_reversed() {
  320. assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa"));
  321. }
  322. #[test]
  323. fn no_match_empty_reversed() {
  324. assert_eq!(None, memrchr(b'a', b""));
  325. }
  326. #[test]
  327. fn no_match_reversed() {
  328. assert_eq!(None, memrchr(b'a', b"xyz"));
  329. }
  330. }