Skip to content

Commit e8fad32

Browse files
committed
Make [u8]::reverse() 5x faster
Since LLVM doesn't vectorize the byte-reversal loop for us, do unaligned reads of a larger type and use LLVM's bswap intrinsic to reverse the actual bytes. This is cfg!-restricted to x86 and x86_64, as I assume it wouldn't help on architectures like ARMv5 where unaligned access is slow. Also makes [u16]::reverse() a more modest 1.5x faster by loading/storing u32 and swapping the two u16 halves with a 16-bit rotate. Thank you ptr::read_unaligned/write_unaligned for making this easy :)
1 parent 06fb4d2 commit e8fad32

File tree

3 files changed

+69
-0
lines changed

3 files changed

+69
-0
lines changed

src/libcollections/benches/slice.rs

+21
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,24 @@ sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
290290
sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
291291
sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
292292
sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
293+
294+
macro_rules! reverse {
295+
($name:ident, $ty:ident) => {
296+
#[bench]
297+
fn $name(b: &mut Bencher) {
298+
// odd length and offset by 1 to be as unaligned as possible
299+
let n = 0xFFFFF;
300+
let mut v: Vec<_> =
301+
(0..1+(n / mem::size_of::<$ty>() as u64))
302+
.map(|x| x as $ty)
303+
.collect();
304+
b.iter(|| black_box(&mut v[1..]).reverse());
305+
b.bytes = n;
306+
}
307+
}
308+
}
309+
310+
reverse!(reverse_u8, u8);
311+
reverse!(reverse_u16, u16);
312+
reverse!(reverse_u32, u32);
313+
reverse!(reverse_u64, u64);

src/libcollections/tests/slice.rs

+10
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,16 @@ fn test_reverse() {
379379
let mut v3 = Vec::<i32>::new();
380380
v3.reverse();
381381
assert!(v3.is_empty());
382+
383+
// check the 1-byte-types path
384+
let mut v = (-50..51i8).collect::<Vec<_>>();
385+
v.reverse();
386+
assert_eq!(v, (-50..51i8).rev().collect::<Vec<_>>());
387+
388+
// check the 2-byte-types path
389+
let mut v = (-50..51i16).collect::<Vec<_>>();
390+
v.reverse();
391+
assert_eq!(v, (-50..51i16).rev().collect::<Vec<_>>());
382392
}
383393

384394
#[test]

src/libcore/slice/mod.rs

+38
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,44 @@ impl<T> SliceExt for [T] {
539539
fn reverse(&mut self) {
540540
let mut i: usize = 0;
541541
let ln = self.len();
542+
543+
let fast_unaligned =
544+
cfg!(any(target_arch = "x86", target_arch = "x86_64"));
545+
546+
if fast_unaligned && mem::size_of::<T>() == 1 {
547+
// Single-byte read & write are comparatively slow. Instead,
548+
// work in usize chunks and get bswap to do the hard work.
549+
let chunk = mem::size_of::<usize>();
550+
while i + chunk - 1 < ln / 2 {
551+
unsafe {
552+
let pa: *mut T = self.get_unchecked_mut(i);
553+
let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
554+
let va = ptr::read_unaligned(pa as *mut usize);
555+
let vb = ptr::read_unaligned(pb as *mut usize);
556+
ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
557+
ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
558+
}
559+
i += chunk;
560+
}
561+
}
562+
563+
if fast_unaligned && mem::size_of::<T>() == 2 {
564+
// Not quite as good as the above, but still helpful.
565+
// Same general idea, read bigger and do the swap in a register.
566+
let chunk = mem::size_of::<u32>() / 2;
567+
while i + chunk - 1 < ln / 2 {
568+
unsafe {
569+
let pa: *mut T = self.get_unchecked_mut(i);
570+
let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
571+
let va = ptr::read_unaligned(pa as *mut u32);
572+
let vb = ptr::read_unaligned(pb as *mut u32);
573+
ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
574+
ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
575+
}
576+
i += chunk;
577+
}
578+
}
579+
542580
while i < ln / 2 {
543581
// Unsafe swap to avoid the bounds check in safe swap.
544582
unsafe {

0 commit comments

Comments
 (0)