diff --git a/library/core/src/slice/rotate.rs b/library/core/src/slice/rotate.rs index d8e0acb565c8c..5d5ee4c7b6240 100644 --- a/library/core/src/slice/rotate.rs +++ b/library/core/src/slice/rotate.rs @@ -1,6 +1,8 @@ use crate::mem::{self, MaybeUninit, SizedTypeProperties}; use crate::{cmp, ptr}; +type BufType = [usize; 32]; + /// Rotates the range `[mid-left, mid+right)` such that the element at `mid` becomes the first /// element. Equivalently, rotates the range `left` elements to the left or `right` elements to the /// right. @@ -8,14 +10,82 @@ use crate::{cmp, ptr}; /// # Safety /// /// The specified range must be valid for reading and writing. +#[inline] +pub(super) unsafe fn ptr_rotate(left: usize, mid: *mut T, right: usize) { + if T::IS_ZST { + return; + } + // abort early if the rotate is a no-op + if (left == 0) || (right == 0) { + return; + } + // `T` is not a zero-sized type, so it's okay to divide by its size. + if !cfg!(feature = "optimize_for_size") + && cmp::min(left, right) <= mem::size_of::() / mem::size_of::() + { + // SAFETY: guaranteed by the caller + unsafe { ptr_rotate_memmove(left, mid, right) }; + } else if !cfg!(feature = "optimize_for_size") + && ((left + right < 24) || (mem::size_of::() > mem::size_of::<[usize; 4]>())) + { + // SAFETY: guaranteed by the caller + unsafe { ptr_rotate_gcd(left, mid, right) } + } else { + // SAFETY: guaranteed by the caller + unsafe { ptr_rotate_swap(left, mid, right) } + } +} + +/// Algorithm 1 is used if `min(left, right)` is small enough to fit onto a stack buffer. The +/// `min(left, right)` elements are copied onto the buffer, `memmove` is applied to the others, and +/// the ones on the buffer are moved back into the hole on the opposite side of where they +/// originated. /// -/// # Algorithm +/// # Safety /// -/// Algorithm 1 is used for small values of `left + right` or for large `T`. The elements are moved -/// into their final positions one at a time starting at `mid - left` and advancing by `right` steps -/// modulo `left + right`, such that only one temporary is needed. Eventually, we arrive back at -/// `mid - left`. However, if `gcd(left + right, right)` is not 1, the above steps skipped over -/// elements. For example: +/// The specified range must be valid for reading and writing. +#[inline] +unsafe fn ptr_rotate_memmove(left: usize, mid: *mut T, right: usize) { + // The `[T; 0]` here is to ensure this is appropriately aligned for T + let mut rawarray = MaybeUninit::<(BufType, [T; 0])>::uninit(); + let buf = rawarray.as_mut_ptr() as *mut T; + // SAFETY: `mid-left <= mid-left+right < mid+right` + let dim = unsafe { mid.sub(left).add(right) }; + if left <= right { + // SAFETY: + // + // 1) The `if` condition about the sizes ensures `[mid-left; left]` will fit in + // `buf` without overflow and `buf` was created just above and so cannot be + // overlapped with any value of `[mid-left; left]` + // 2) [mid-left, mid+right) are all valid for reading and writing and we don't care + // about overlaps here. + // 3) The `if` condition about `left <= right` ensures writing `left` elements to + // `dim = mid-left+right` is valid because: + // - `buf` is valid and `left` elements were written in it in 1) + // - `dim+left = mid-left+right+left = mid+right` and we write `[dim, dim+left)` + unsafe { + // 1) + ptr::copy_nonoverlapping(mid.sub(left), buf, left); + // 2) + ptr::copy(mid, mid.sub(left), right); + // 3) + ptr::copy_nonoverlapping(buf, dim, left); + } + } else { + // SAFETY: same reasoning as above but with `left` and `right` reversed + unsafe { + ptr::copy_nonoverlapping(mid, buf, right); + ptr::copy(mid.sub(left), dim, left); + ptr::copy_nonoverlapping(buf, mid.sub(left), right); + } + } +} + +/// Algorithm 2 is used for small values of `left + right` or for large `T`. The elements +/// are moved into their final positions one at a time starting at `mid - left` and advancing by +/// `right` steps modulo `left + right`, such that only one temporary is needed. Eventually, we +/// arrive back at `mid - left`. However, if `gcd(left + right, right)` is not 1, the above steps +/// skipped over elements. For example: /// ```text /// left = 10, right = 6 /// the `^` indicates an element in its final place @@ -39,17 +109,104 @@ use crate::{cmp, ptr}; /// `gcd(left + right, right)` value). The end result is that all elements are finalized once and /// only once. /// -/// Algorithm 2 is used if `left + right` is large but `min(left, right)` is small enough to -/// fit onto a stack buffer. The `min(left, right)` elements are copied onto the buffer, `memmove` -/// is applied to the others, and the ones on the buffer are moved back into the hole on the -/// opposite side of where they originated. -/// -/// Algorithms that can be vectorized outperform the above once `left + right` becomes large enough. -/// Algorithm 1 can be vectorized by chunking and performing many rounds at once, but there are too +/// Algorithm 2 can be vectorized by chunking and performing many rounds at once, but there are too /// few rounds on average until `left + right` is enormous, and the worst case of a single -/// round is always there. Instead, algorithm 3 utilizes repeated swapping of -/// `min(left, right)` elements until a smaller rotate problem is left. +/// round is always there. +/// +/// # Safety +/// +/// The specified range must be valid for reading and writing. +#[inline] +unsafe fn ptr_rotate_gcd(left: usize, mid: *mut T, right: usize) { + // Algorithm 2 + // Microbenchmarks indicate that the average performance for random shifts is better all + // the way until about `left + right == 32`, but the worst case performance breaks even + // around 16. 24 was chosen as middle ground. If the size of `T` is larger than 4 + // `usize`s, this algorithm also outperforms other algorithms. + // SAFETY: callers must ensure `mid - left` is valid for reading and writing. + let x = unsafe { mid.sub(left) }; + // beginning of first round + // SAFETY: see previous comment. + let mut tmp: T = unsafe { x.read() }; + let mut i = right; + // `gcd` can be found before hand by calculating `gcd(left + right, right)`, + // but it is faster to do one loop which calculates the gcd as a side effect, then + // doing the rest of the chunk + let mut gcd = right; + // benchmarks reveal that it is faster to swap temporaries all the way through instead + // of reading one temporary once, copying backwards, and then writing that temporary at + // the very end. This is possibly due to the fact that swapping or replacing temporaries + // uses only one memory address in the loop instead of needing to manage two. + loop { + // [long-safety-expl] + // SAFETY: callers must ensure `[left, left+mid+right)` are all valid for reading and + // writing. + // + // - `i` start with `right` so `mid-left <= x+i = x+right = mid-left+right < mid+right` + // - `i <= left+right-1` is always true + // - if `i < left`, `right` is added so `i < left+right` and on the next + // iteration `left` is removed from `i` so it doesn't go further + // - if `i >= left`, `left` is removed immediately and so it doesn't go further. + // - overflows cannot happen for `i` since the function's safety contract ask for + // `mid+right-1 = x+left+right` to be valid for writing + // - underflows cannot happen because `i` must be bigger or equal to `left` for + // a subtraction of `left` to happen. + // + // So `x+i` is valid for reading and writing if the caller respected the contract + tmp = unsafe { x.add(i).replace(tmp) }; + // instead of incrementing `i` and then checking if it is outside the bounds, we + // check if `i` will go outside the bounds on the next increment. This prevents + // any wrapping of pointers or `usize`. + if i >= left { + i -= left; + if i == 0 { + // end of first round + // SAFETY: tmp has been read from a valid source and x is valid for writing + // according to the caller. + unsafe { x.write(tmp) }; + break; + } + // this conditional must be here if `left + right >= 15` + if i < gcd { + gcd = i; + } + } else { + i += right; + } + } + // finish the chunk with more rounds + for start in 1..gcd { + // SAFETY: `gcd` is at most equal to `right` so all values in `1..gcd` are valid for + // reading and writing as per the function's safety contract, see [long-safety-expl] + // above + tmp = unsafe { x.add(start).read() }; + // [safety-expl-addition] + // + // Here `start < gcd` so `start < right` so `i < right+right`: `right` being the + // greatest common divisor of `(left+right, right)` means that `left = right` so + // `i < left+right` so `x+i = mid-left+i` is always valid for reading and writing + // according to the function's safety contract. + i = start + right; + loop { + // SAFETY: see [long-safety-expl] and [safety-expl-addition] + tmp = unsafe { x.add(i).replace(tmp) }; + if i >= left { + i -= left; + if i == start { + // SAFETY: see [long-safety-expl] and [safety-expl-addition] + unsafe { x.add(start).write(tmp) }; + break; + } + } else { + i += right; + } + } + } +} + +/// Algorithm 3 utilizes repeated swapping of `min(left, right)` elements. /// +/// /// /// ```text /// left = 11, right = 4 /// [4 5 6 7 8 9 10 11 12 13 14 . 0 1 2 3] @@ -60,144 +217,14 @@ use crate::{cmp, ptr}; /// we cannot swap any more, but a smaller rotation problem is left to solve /// ``` /// when `left < right` the swapping happens from the left instead. -pub(super) unsafe fn ptr_rotate(mut left: usize, mut mid: *mut T, mut right: usize) { - type BufType = [usize; 32]; - if T::IS_ZST { - return; - } +/// +/// # Safety +/// +/// The specified range must be valid for reading and writing. +#[inline] +unsafe fn ptr_rotate_swap(mut left: usize, mut mid: *mut T, mut right: usize) { loop { - // N.B. the below algorithms can fail if these cases are not checked - if (right == 0) || (left == 0) { - return; - } - if !cfg!(feature = "optimize_for_size") - && ((left + right < 24) || (mem::size_of::() > mem::size_of::<[usize; 4]>())) - { - // Algorithm 1 - // Microbenchmarks indicate that the average performance for random shifts is better all - // the way until about `left + right == 32`, but the worst case performance breaks even - // around 16. 24 was chosen as middle ground. If the size of `T` is larger than 4 - // `usize`s, this algorithm also outperforms other algorithms. - // SAFETY: callers must ensure `mid - left` is valid for reading and writing. - let x = unsafe { mid.sub(left) }; - // beginning of first round - // SAFETY: see previous comment. - let mut tmp: T = unsafe { x.read() }; - let mut i = right; - // `gcd` can be found before hand by calculating `gcd(left + right, right)`, - // but it is faster to do one loop which calculates the gcd as a side effect, then - // doing the rest of the chunk - let mut gcd = right; - // benchmarks reveal that it is faster to swap temporaries all the way through instead - // of reading one temporary once, copying backwards, and then writing that temporary at - // the very end. This is possibly due to the fact that swapping or replacing temporaries - // uses only one memory address in the loop instead of needing to manage two. - loop { - // [long-safety-expl] - // SAFETY: callers must ensure `[left, left+mid+right)` are all valid for reading and - // writing. - // - // - `i` start with `right` so `mid-left <= x+i = x+right = mid-left+right < mid+right` - // - `i <= left+right-1` is always true - // - if `i < left`, `right` is added so `i < left+right` and on the next - // iteration `left` is removed from `i` so it doesn't go further - // - if `i >= left`, `left` is removed immediately and so it doesn't go further. - // - overflows cannot happen for `i` since the function's safety contract ask for - // `mid+right-1 = x+left+right` to be valid for writing - // - underflows cannot happen because `i` must be bigger or equal to `left` for - // a subtraction of `left` to happen. - // - // So `x+i` is valid for reading and writing if the caller respected the contract - tmp = unsafe { x.add(i).replace(tmp) }; - // instead of incrementing `i` and then checking if it is outside the bounds, we - // check if `i` will go outside the bounds on the next increment. This prevents - // any wrapping of pointers or `usize`. - if i >= left { - i -= left; - if i == 0 { - // end of first round - // SAFETY: tmp has been read from a valid source and x is valid for writing - // according to the caller. - unsafe { x.write(tmp) }; - break; - } - // this conditional must be here if `left + right >= 15` - if i < gcd { - gcd = i; - } - } else { - i += right; - } - } - // finish the chunk with more rounds - for start in 1..gcd { - // SAFETY: `gcd` is at most equal to `right` so all values in `1..gcd` are valid for - // reading and writing as per the function's safety contract, see [long-safety-expl] - // above - tmp = unsafe { x.add(start).read() }; - // [safety-expl-addition] - // - // Here `start < gcd` so `start < right` so `i < right+right`: `right` being the - // greatest common divisor of `(left+right, right)` means that `left = right` so - // `i < left+right` so `x+i = mid-left+i` is always valid for reading and writing - // according to the function's safety contract. - i = start + right; - loop { - // SAFETY: see [long-safety-expl] and [safety-expl-addition] - tmp = unsafe { x.add(i).replace(tmp) }; - if i >= left { - i -= left; - if i == start { - // SAFETY: see [long-safety-expl] and [safety-expl-addition] - unsafe { x.add(start).write(tmp) }; - break; - } - } else { - i += right; - } - } - } - return; - // `T` is not a zero-sized type, so it's okay to divide by its size. - } else if !cfg!(feature = "optimize_for_size") - && cmp::min(left, right) <= mem::size_of::() / mem::size_of::() - { - // Algorithm 2 - // The `[T; 0]` here is to ensure this is appropriately aligned for T - let mut rawarray = MaybeUninit::<(BufType, [T; 0])>::uninit(); - let buf = rawarray.as_mut_ptr() as *mut T; - // SAFETY: `mid-left <= mid-left+right < mid+right` - let dim = unsafe { mid.sub(left).add(right) }; - if left <= right { - // SAFETY: - // - // 1) The `else if` condition about the sizes ensures `[mid-left; left]` will fit in - // `buf` without overflow and `buf` was created just above and so cannot be - // overlapped with any value of `[mid-left; left]` - // 2) [mid-left, mid+right) are all valid for reading and writing and we don't care - // about overlaps here. - // 3) The `if` condition about `left <= right` ensures writing `left` elements to - // `dim = mid-left+right` is valid because: - // - `buf` is valid and `left` elements were written in it in 1) - // - `dim+left = mid-left+right+left = mid+right` and we write `[dim, dim+left)` - unsafe { - // 1) - ptr::copy_nonoverlapping(mid.sub(left), buf, left); - // 2) - ptr::copy(mid, mid.sub(left), right); - // 3) - ptr::copy_nonoverlapping(buf, dim, left); - } - } else { - // SAFETY: same reasoning as above but with `left` and `right` reversed - unsafe { - ptr::copy_nonoverlapping(mid, buf, right); - ptr::copy(mid.sub(left), dim, left); - ptr::copy_nonoverlapping(buf, mid.sub(left), right); - } - } - return; - } else if left >= right { + if left >= right { // Algorithm 3 // There is an alternate way of swapping that involves finding where the last swap // of this algorithm would be, and swapping using that last chunk instead of swapping @@ -233,5 +260,8 @@ pub(super) unsafe fn ptr_rotate(mut left: usize, mut mid: *mut T, mut right: } } } + if (right == 0) || (left == 0) { + return; + } } } diff --git a/tests/codegen/lib-optimizations/slice_rotate.rs b/tests/codegen/lib-optimizations/slice_rotate.rs new file mode 100644 index 0000000000000..d0a7b328d1845 --- /dev/null +++ b/tests/codegen/lib-optimizations/slice_rotate.rs @@ -0,0 +1,30 @@ +//@ compile-flags: -O + +#![crate_type = "lib"] + +// Ensure that the simple case of rotating by a constant 1 optimizes to the obvious thing + +// CHECK-LABEL: @rotate_left_by_one +#[no_mangle] +pub fn rotate_left_by_one(slice: &mut [i32]) { + // CHECK-NOT: phi + // CHECK-NOT: call + // CHECK-NOT: load + // CHECK-NOT: store + // CHECK-NOT: getelementptr + // CHECK: %[[END:.+]] = getelementptr + // CHECK-NEXT: %[[DIM:.+]] = getelementptr + // CHECK-NEXT: %[[LAST:.+]] = load + // CHECK-NEXT: %[[FIRST:.+]] = shl + // CHECK-NEXT: call void @llvm.memmove + // CHECK-NEXT: store i32 %[[LAST]], ptr %[[DIM:.+]] + // CHECK-NOT: phi + // CHECK-NOT: call + // CHECK-NOT: load + // CHECK-NOT: store + // CHECK-NOT: getelementptr + // CHECK: ret void + if !slice.is_empty() { + slice.rotate_left(1); + } +}