Replace sort implementations
- `slice::sort` -> driftsort: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
- `slice::sort_unstable` -> ipnsort: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md

Replaces the sort implementations with tailor-made ones that strike a balance of run-time, compile-time and binary-size, yielding run-time and compile-time improvements. Binary-size regresses for `slice::sort` but improves for `slice::sort_unstable`. All while upholding the existing soft and hard safety guarantees, and even extending the soft guarantees: strict weak ordering violations are now detected with a high chance and reported to users via a panic. In addition, the implementation of `select_nth_unstable` is adapted, as it uses `slice::sort_unstable` internals.
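The new ordering-violation guarantee is easiest to see in action. A minimal sketch (not part of the diff) of what user code can now rely on; whether the panic actually fires depends on the input, so violations are only detected "with a high chance":

```rust
use std::cmp::Ordering;

fn main() {
    // A comparator that always answers `Less` violates strict weak ordering
    // (it is not antisymmetric). The new implementations may detect this and
    // panic; in all cases every original element remains in the slice.
    let mut v = vec![3, 1, 2];
    let result = std::panic::catch_unwind(move || {
        v.sort_by(|_, _| Ordering::Less);
        v
    });
    if let Ok(v) = result {
        // No element was lost or duplicated, even though the order is
        // unspecified.
        assert_eq!(v.len(), 3);
    }
}
```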
+104 -111
@@ -16,7 +16,7 @@
#[cfg(not(no_global_oom_handling))]
use core::cmp::Ordering::{self, Less};
#[cfg(not(no_global_oom_handling))]
-use core::mem::{self, SizedTypeProperties};
+use core::mem::{self, MaybeUninit};
#[cfg(not(no_global_oom_handling))]
use core::ptr;
#[cfg(not(no_global_oom_handling))]
@@ -24,7 +24,7 @@
use crate::alloc::Allocator;
#[cfg(not(no_global_oom_handling))]
-use crate::alloc::{self, Global};
+use crate::alloc::Global;
#[cfg(not(no_global_oom_handling))]
use crate::borrow::ToOwned;
use crate::boxed::Box;
@@ -174,23 +174,32 @@ fn to_vec<A: Allocator>(s: &[Self], alloc: A) -> Vec<Self, A> {

#[cfg(not(test))]
impl<T> [T] {
-/// Sorts the slice.
+/// Sorts the slice, preserving initial order of equal elements.
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
+/// worst-case.
///
+/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
+/// original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `T: Ord` panics.
+///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
-/// sorting and it doesn't allocate auxiliary memory.
-/// See [`sort_unstable`](slice::sort_unstable).
+/// sorting and it doesn't allocate auxiliary memory. See
+/// [`sort_unstable`](slice::sort_unstable). The exceptions are partially sorted slices, which
+/// may be better served with `slice::sort`.
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
+///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@@ -200,6 +209,8 @@ impl<T> [T] {
/// v.sort();
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@@ -211,13 +222,18 @@ pub fn sort(&mut self)
stable_sort(self, T::lt);
}
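What "preserving initial order of equal elements" buys in practice is easiest to show with compound values. A small illustration (not part of the diff), sorting pairs by their first field only:

```rust
fn main() {
    // A stable sort keeps the original left-to-right order of the two
    // entries whose key is `1`.
    let mut v = [(1, 'b'), (0, 'x'), (1, 'a')];
    v.sort_by_key(|&(k, _)| k);
    assert_eq!(v, [(0, 'x'), (1, 'b'), (1, 'a')]);
    // `sort_unstable_by_key` makes no such promise: `(1, 'b')` and
    // `(1, 'a')` could come out in either order.
}
```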

-/// Sorts the slice with a comparator function.
+/// Sorts the slice with a comparator function, preserving initial order of equal elements.
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
+/// worst-case.
///
-/// The comparator function must define a total ordering for the elements in the slice. If
-/// the ordering is not total, the order of the elements is unspecified. An order is a
-/// total order if it is (for all `a`, `b` and `c`):
+/// The comparator function should define a total ordering for the elements in the slice. If the
+/// ordering is not total, the order of the elements is unspecified.
+///
+/// If the comparator function does not implement a total order the resulting order is
+/// unspecified. All original elements will remain in the slice and any possible modifications
+/// via interior mutability are observed in the input. Same is true if the comparator function
+/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
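A hedged example (not part of the diff) of what these two laws rule out; comparators like the second one below are exactly what the new implementations try to detect:

```rust
fn main() {
    let mut v = [3_i32, -1, 2];

    // Lawful: `i32::cmp` on absolute values is total and transitive, so the
    // result is fully specified (up to the order of equal keys).
    v.sort_by(|a, b| a.abs().cmp(&b.abs()));
    assert_eq!(v, [-1, 2, 3]);

    // Unlawful (left commented out on purpose): a comparator that never
    // answers `Equal` claims both `a < b` and `b < a` for equal elements,
    // violating antisymmetry. The resulting order is unspecified and the
    // sort may panic:
    //
    // v.sort_by(|a, b| if a <= b { std::cmp::Ordering::Less } else { std::cmp::Ordering::Greater });
}
```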
@@ -227,23 +243,22 @@ pub fn sort(&mut self)
///
/// ```
/// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
-/// floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
+/// floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
/// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
/// ```
///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
/// sorting and it doesn't allocate auxiliary memory.
/// See [`sort_unstable_by`](slice::sort_unstable_by).
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
+///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@@ -256,6 +271,8 @@ pub fn sort(&mut self)
/// v.sort_by(|a, b| b.cmp(a));
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@@ -267,28 +284,27 @@ pub fn sort_by<F>(&mut self, mut compare: F)
stable_sort(self, |a, b| compare(a, b) == Less);
}

-/// Sorts the slice with a key extraction function.
+/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
/// worst-case, where the key function is *O*(*m*).
///
/// For expensive key functions (e.g. functions that are not simple property accesses or
/// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be
/// significantly faster, as it does not recompute element keys.
///
-/// When applicable, unstable sorting is preferred because it is generally faster than stable
-/// sorting and it doesn't allocate auxiliary memory.
-/// See [`sort_unstable_by_key`](slice::sort_unstable_by_key).
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
///
+/// If `K: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -298,6 +314,8 @@ pub fn sort_by<F>(&mut self, mut compare: F)
/// v.sort_by_key(|k| k.abs());
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_key", since = "1.7.0")]
@@ -310,27 +328,30 @@ pub fn sort_by_key<K, F>(&mut self, mut f: F)
stable_sort(self, |a, b| f(a).lt(&f(b)));
}

-/// Sorts the slice with a key extraction function.
+/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
-/// During sorting, the key function is called at most once per element, by using
-/// temporary storage to remember the results of key evaluation.
-/// The order of calls to the key function is unspecified and may change in future versions
-/// of the standard library.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \*
+/// log(*n*)) worst-case, where the key function is *O*(*m*).
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
-/// worst-case, where the key function is *O*(*m*).
+/// During sorting, the key function is called at most once per element, by using temporary
+/// storage to remember the results of key evaluation. The order of calls to the key function is
+/// unspecified and may change in future versions of the standard library.
///
-/// For simple key functions (e.g., functions that are property accesses or
-/// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be
-/// faster.
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
+///
+/// For simple key functions (e.g., functions that are property accesses or basic operations),
+/// [`sort_by_key`](slice::sort_by_key) is likely to be faster.
///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [instruction-parallel-network sort][ipnsort] by Lukas
+/// Bergdoll, which combines the fast average case of randomized quicksort with the fast worst
+/// case of heapsort, while achieving linear time on fully sorted and reversed inputs. And
+/// *O*(*k* \* log(*n*)) where *k* is the number of distinct elements in the input. It leverages
+/// superscalar out-of-order execution capabilities commonly found in CPUs, to efficiently
+/// perform the operation.
///
/// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
/// length of the slice.
@@ -344,7 +365,7 @@ pub fn sort_by_key<K, F>(&mut self, mut f: F)
/// assert!(v == [-3, -5, 2, 32, 4]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")]
@@ -361,7 +382,7 @@ macro_rules! sort_by_key {
$slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
// The elements of `indices` are unique, as they are indexed, so any sort will be
// stable with respect to the original slice. We use `sort_unstable` here because
-// it requires less memory allocation.
+// it requires no memory allocation.
indices.sort_unstable();
for i in 0..$slice.len() {
let mut index = indices[i].1;
@@ -374,24 +395,24 @@ macro_rules! sort_by_key {
}};
}

-let sz_u8 = mem::size_of::<(K, u8)>();
-let sz_u16 = mem::size_of::<(K, u16)>();
-let sz_u32 = mem::size_of::<(K, u32)>();
-let sz_usize = mem::size_of::<(K, usize)>();

let len = self.len();
if len < 2 {
return;
}
-if sz_u8 < sz_u16 && len <= (u8::MAX as usize) {
-return sort_by_key!(u8, self, f);
-}
-if sz_u16 < sz_u32 && len <= (u16::MAX as usize) {
-return sort_by_key!(u16, self, f);
-}
-if sz_u32 < sz_usize && len <= (u32::MAX as usize) {

+// Avoids binary-size usage in cases where the alignment doesn't work out to make this
+// beneficial or on 32-bit platforms.
+let is_using_u32_as_idx_type_helpful =
+const { mem::size_of::<(K, u32)>() < mem::size_of::<(K, usize)>() };

+// It's possible to instantiate this for u8 and u16 but doing so is very wasteful in terms
+// of compile-times and binary-size; the peak saved heap memory for u16 is (u8 + u16) -> 4
+// bytes * u16::MAX vs (u8 + u32) -> 8 bytes * u16::MAX, the saved heap memory is at peak
+// ~262KB.
+if is_using_u32_as_idx_type_helpful && len <= (u32::MAX as usize) {
return sort_by_key!(u32, self, f);
}

sort_by_key!(usize, self, f)
}
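The `const` size check above is all about tuple padding. A small demonstration (not from the diff; a 64-bit target is assumed) of when the `u32` index type actually shrinks the temporary buffer:

```rust
use std::mem::size_of;

fn main() {
    // With a 4-byte key, switching the index from `usize` to `u32` halves
    // the per-element cost of the `(key, index)` buffer...
    assert_eq!(size_of::<(u32, u32)>(), 8);
    assert_eq!(size_of::<(u32, usize)>(), 16);

    // ...but with an 8-byte key, alignment pads `(u64, u32)` back up to 16
    // bytes, so the `u32` index buys nothing. That is exactly the case the
    // `is_using_u32_as_idx_type_helpful` check filters out.
    assert_eq!(size_of::<(u64, u32)>(), size_of::<(u64, usize)>());
}
```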
@@ -843,46 +864,18 @@ fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
where
F: FnMut(&T, &T) -> bool,
{
-if T::IS_ZST {
-// Sorting has no meaningful behavior on zero-sized types. Do nothing.
-return;
+use sort::stable::BufGuard;

+#[unstable(issue = "none", feature = "std_internals")]
+impl<T> BufGuard<T> for Vec<T> {
+fn with_capacity(capacity: usize) -> Self {
+Vec::with_capacity(capacity)
}

+fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
+self.spare_capacity_mut()
+}
+}

-let elem_alloc_fn = |len: usize| -> *mut T {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
-// v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
-// elements.
-unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
-};

-let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
-// v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
-// len.
-unsafe {
-alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
-}
-};

-let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
-// obscene length or 0.
-unsafe {
-alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
-as *mut sort::TimSortRun
-}
-};

-let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
-// SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
-// len.
-unsafe {
-alloc::dealloc(
-buf_ptr as *mut u8,
-alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
-);
-}
-};

-sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
+sort::stable::sort::<T, F, Vec<T>>(v, &mut is_less);
}
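The `BufGuard` impl above is the whole alloc-side contribution: the sorting logic lives in `core`, which cannot allocate, so it is generic over a scratch-buffer provider. A hedged sketch (hypothetical `ScratchBuf` trait, not the std-internal one) of that decoupling:

```rust
use std::mem::MaybeUninit;

// The core sort only needs something that can hand out uninitialized
// scratch space; `alloc` plugs in `Vec<T>`.
trait ScratchBuf<T> {
    fn with_capacity(capacity: usize) -> Self;
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}

impl<T> ScratchBuf<T> for Vec<T> {
    fn with_capacity(capacity: usize) -> Self {
        Vec::with_capacity(capacity)
    }
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        // The allocated-but-unused region of the Vec is exactly the
        // scratch space a merge needs; it is freed when the Vec drops.
        self.spare_capacity_mut()
    }
}

fn main() {
    let mut buf: Vec<u32> = ScratchBuf::with_capacity(8);
    assert!(ScratchBuf::as_uninit_slice_mut(&mut buf).len() >= 8);
}
```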

@@ -34,7 +34,7 @@ macro_rules! do_test {
}

let v = $input.to_owned();
-let _ = std::panic::catch_unwind(move || {
+let _ = panic::catch_unwind(move || {
let mut v = v;
let mut panic_countdown = panic_countdown;
v.$func(|a, b| {
@@ -197,8 +197,7 @@ fn panic_safe() {

let mut rng = test_rng();

-// Miri is too slow (but still need to `chain` to make the types match)
-let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) };
+let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) };
let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] };

for len in lens {
@@ -294,15 +293,20 @@ fn test_sort() {
}
}

-// Sort using a completely random comparison function.
-// This will reorder the elements *somehow*, but won't panic.
-let mut v = [0; 500];
-for i in 0..v.len() {
+const ORD_VIOLATION_MAX_LEN: usize = 500;
+let mut v = [0; ORD_VIOLATION_MAX_LEN];
+for i in 0..ORD_VIOLATION_MAX_LEN {
v[i] = i as i32;
}
-v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());

+// Sort using a completely random comparison function. This will reorder the elements *somehow*,
+// it may panic but the original elements must still be present.
+let _ = panic::catch_unwind(move || {
+v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
+});

v.sort();
-for i in 0..v.len() {
+for i in 0..ORD_VIOLATION_MAX_LEN {
assert_eq!(v[i], i as i32);
}
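A standalone distillation (not the test above) of the contract this test pins down: under a comparator that violates strict weak ordering, the sort may panic, but it may not lose, duplicate, or invent elements:

```rust
use std::cmp::Ordering;
use std::panic::{self, AssertUnwindSafe};

fn main() {
    let mut v: Vec<i32> = (0..100).collect();
    let mut flip = false;
    let _ = panic::catch_unwind(AssertUnwindSafe(|| {
        v.sort_by(|_, _| {
            // Alternating Less/Greater is not a strict weak ordering.
            flip = !flip;
            if flip { Ordering::Less } else { Ordering::Greater }
        });
    }));
    // Whatever happened, all original elements are still present exactly once.
    v.sort();
    assert_eq!(v, (0..100).collect::<Vec<i32>>());
}
```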
+109 -84
@@ -39,7 +39,6 @@
mod iter;
mod raw;
mod rotate;
-mod select;
mod specialize;

#[unstable(feature = "str_internals", issue = "none")]
@@ -83,10 +82,6 @@
#[unstable(feature = "slice_from_ptr_range", issue = "89792")]
pub use raw::{from_mut_ptr_range, from_ptr_range};

-// This function is public only because there is no other way to unit test heapsort.
-#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
-pub use sort::heapsort;

#[stable(feature = "slice_get_slice", since = "1.28.0")]
pub use index::SliceIndex;
@@ -2884,21 +2879,26 @@ pub fn binary_search_by_key<'a, B, F>(&'a self, b: &B, mut f: F) -> Result<usize
self.binary_search_by(|k| f(k).cmp(b))
}

-/// Sorts the slice, but might not preserve the order of equal elements.
+/// Sorts the slice **without** preserving the initial order of equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
+/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
+/// original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `T: Ord` panics.
+///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
-/// slice consists of several concatenated sorted sequences.
+/// slice is partially sorted.
///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -2909,25 +2909,29 @@ pub fn binary_search_by_key<'a, B, F>(&'a self, b: &B, mut f: F) -> Result<usize
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable(&mut self)
where
T: Ord,
{
-sort::quicksort(self, T::lt);
+sort::unstable::sort(self, &mut T::lt);
}

-/// Sorts the slice with a comparator function, but might not preserve the order of equal
-/// elements.
+/// Sorts the slice with a comparator function, **without** preserving the initial order of
+/// equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
-/// The comparator function must define a total ordering for the elements in the slice. If
-/// the ordering is not total, the order of the elements is unspecified. An order is a
-/// total order if it is (for all `a`, `b` and `c`):
+/// The comparator function should define a total ordering for the elements in the slice. If the
+/// ordering is not total, the order of the elements is unspecified.
+///
+/// If the comparator function does not implement a total order the resulting order is
+/// unspecified. All original elements will remain in the slice and any possible modifications
+/// via interior mutability are observed in the input. Same is true if the comparator function
+/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
@@ -2943,14 +2947,15 @@ pub fn sort_unstable(&mut self)
///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
-/// slice consists of several concatenated sorted sequences.
+/// slice is partially sorted.
///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -2964,34 +2969,37 @@ pub fn sort_unstable(&mut self)
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by<F>(&mut self, mut compare: F)
where
F: FnMut(&T, &T) -> Ordering,
{
-sort::quicksort(self, |a, b| compare(a, b) == Ordering::Less);
+sort::unstable::sort(self, &mut |a, b| compare(a, b) == Ordering::Less);
}

-/// Sorts the slice with a key extraction function, but might not preserve the order of equal
-/// elements.
+/// Sorts the slice with a key extraction function, **without** preserving the initial order of
+/// equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case, where the key function is
-/// *O*(*m*).
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
+///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Due to its key calling strategy, [`sort_unstable_by_key`](#method.sort_unstable_by_key)
-/// is likely to be slower than [`sort_by_cached_key`](#method.sort_by_cached_key) in
-/// cases where the key function is expensive.
+/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
+/// slice is partially sorted.
///
+/// If `K: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -3002,7 +3010,7 @@ pub fn sort_unstable_by<F>(&mut self, mut compare: F)
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
@@ -3010,27 +3018,32 @@ pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
F: FnMut(&T) -> K,
K: Ord,
{
-sort::quicksort(self, |a, b| f(a).lt(&f(b)));
+sort::unstable::sort(self, &mut |a, b| f(a).lt(&f(b)));
}

-/// Reorder the slice such that the element at `index` after the reordering is at its final sorted position.
+/// Reorder the slice such that the element at `index` after the reordering is at its final
+/// sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index`. Additionally, this reordering is
-/// unstable (i.e. any number of equal elements may end up at position `index`), in-place
-/// (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// unstable (i.e. any number of equal elements may end up at position `index`), in-place (i.e.
+/// does not allocate), and runs in *O*(*n*) time. This function is also known as "kth element"
+/// in other libraries.
///
-/// It returns a triplet of the following from the reordered slice:
-/// the subslice prior to `index`, the element at `index`, and the subslice after `index`;
-/// accordingly, the values in those two subslices will respectively all be less-than-or-equal-to
-/// and greater-than-or-equal-to the value of the element at `index`.
+/// It returns a triplet of the following from the reordered slice: the subslice prior to
+/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
+/// those two subslices will respectively all be less-than-or-equal-to and
+/// greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3058,35 +3071,40 @@ pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
/// v == [-3, -5, 1, 4, 2] ||
/// v == [-5, -3, 1, 4, 2]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [T])
where
T: Ord,
{
-select::partition_at_index(self, index, T::lt)
+sort::select::partition_at_index(self, index, T::lt)
}
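The partition guarantee described above is what makes `select_nth_unstable` a one-call median/percentile query. A quick illustration (not part of the diff):

```rust
fn main() {
    let mut v = [7, 1, 5, 3, 9];
    // After the call, index 2 holds the value a full sort would put there
    // (the median); everything before is <= it, everything after >= it.
    let (lesser, median, greater) = v.select_nth_unstable(2);
    assert_eq!(*median, 5);
    assert!(lesser.iter().all(|&x| x <= 5));
    assert!(greater.iter().all(|&x| x >= 5));
}
```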

-/// Reorder the slice with a comparator function such that the element at `index` after the reordering is at
-/// its final sorted position.
+/// Reorder the slice with a comparator function such that the element at `index` after the
+/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the comparator function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
-/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
+/// function is also known as "kth element" in other libraries.
///
-/// It returns a triplet of the following from
-/// the slice reordered according to the provided comparator function: the subslice prior to
-/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
-/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
-/// the value of the element at `index`.
+/// It returns a triplet of the following from the slice reordered according to the provided
+/// comparator function: the subslice prior to `index`, the element at `index`, and the subslice
+/// after `index`; accordingly, the values in those two subslices will respectively all be
+/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3114,6 +3132,8 @@ pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [
/// v == [4, 2, 1, -5, -3] ||
/// v == [4, 2, 1, -3, -5]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by<F>(
@@ -3124,29 +3144,32 @@ pub fn select_nth_unstable_by<F>(
where
F: FnMut(&T, &T) -> Ordering,
{
-select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
+sort::select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
}

-/// Reorder the slice with a key extraction function such that the element at `index` after the reordering is
-/// at its final sorted position.
+/// Reorder the slice with a key extraction function such that the element at `index` after the
+/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the key extraction function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
-/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
+/// function is also known as "kth element" in other libraries.
///
-/// It returns a triplet of the following from
-/// the slice reordered according to the provided key extraction function: the subslice prior to
-/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
-/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
-/// the value of the element at `index`.
+/// It returns a triplet of the following from the slice reordered according to the provided key
+/// extraction function: the subslice prior to `index`, the element at `index`, and the subslice
+/// after `index`; accordingly, the values in those two subslices will respectively all be
+/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3174,6 +3197,8 @@ pub fn select_nth_unstable_by<F>(
/// v == [2, 1, -3, 4, -5] ||
/// v == [2, 1, -3, -5, 4]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by_key<K, F>(
@@ -3185,7 +3210,7 @@ pub fn select_nth_unstable_by_key<K, F>(
F: FnMut(&T) -> K,
K: Ord,
{
-select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
+sort::select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
}

/// Moves all consecutive repeated elements to the end of the slice according to the
@@ -1,1383 +0,0 @@
//! Slice sorting
//!
//! This module contains a sorting algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//!
//! Unstable sorting is compatible with core because it doesn't allocate memory, unlike our
//! stable sorting implementation.
//!
//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on
//! TimSort.

use crate::cmp;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
use crate::ptr;

// When dropped, copies from `src` into `dest`.
struct InsertionHole<T> {
    src: *const T,
    dest: *mut T,
}

impl<T> Drop for InsertionHole<T> {
    fn drop(&mut self) {
        // SAFETY: This is a helper class. Please refer to its usage for correctness. Namely, one
        // must be sure that `src` and `dst` do not overlap as required by
        // `ptr::copy_nonoverlapping` and are both valid for writes.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dest, 1);
        }
    }
}
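`InsertionHole` is a drop guard: while an element has been moved out and the slice temporarily holds a duplicate, an unwind from `is_less` must still restore every value exactly once. A safe analogue of the idea (illustrative only, hypothetical names):

```rust
// Do work that may panic while an invariant is temporarily broken, and
// let `Drop` restore it even on unwind.
struct RestoreOnDrop<'a> {
    slot: &'a mut Option<i32>,
    value: i32,
}

impl Drop for RestoreOnDrop<'_> {
    fn drop(&mut self) {
        // Runs on normal exit *and* on panic-unwind, so the "hole" in
        // `slot` is always filled again.
        *self.slot = Some(self.value);
    }
}

fn main() {
    let mut slot = Some(1);
    let value = slot.take().unwrap(); // invariant broken: `slot` is empty
    let guard = RestoreOnDrop { slot: &mut slot, value };
    // ...comparisons that might panic would run here...
    drop(guard); // invariant restored (also on unwind)
    assert_eq!(slot, Some(1));
}
```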

/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
/// becomes sorted.
unsafe fn insert_tail<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    debug_assert!(v.len() >= 2);

    let arr_ptr = v.as_mut_ptr();
    let i = v.len() - 1;

    // SAFETY: caller must ensure v is at least len 2.
    unsafe {
        // See insert_head which talks about why this approach is beneficial.
        let i_ptr = arr_ptr.add(i);

        // It's important that we use i_ptr here. If this check is positive and we continue,
        // we want to make sure that no other copy of the value was seen by is_less.
        // Otherwise we would have to copy it back.
        if is_less(&*i_ptr, &*i_ptr.sub(1)) {
            // It's important that we use tmp for comparison from now on. As it is the value that
            // will be copied back. And notionally we could have created a divergence if we copy
            // back the wrong value.
            let tmp = mem::ManuallyDrop::new(ptr::read(i_ptr));
            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole { src: &*tmp, dest: i_ptr.sub(1) };
            ptr::copy_nonoverlapping(hole.dest, i_ptr, 1);

            // SAFETY: We know i is at least 1.
            for j in (0..(i - 1)).rev() {
                let j_ptr = arr_ptr.add(j);
                if !is_less(&*tmp, &*j_ptr) {
                    break;
                }

                ptr::copy_nonoverlapping(j_ptr, hole.dest, 1);
                hole.dest = j_ptr;
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }
}

/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
unsafe fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    debug_assert!(v.len() >= 2);

    // SAFETY: caller must ensure v is at least len 2.
    unsafe {
        if is_less(v.get_unchecked(1), v.get_unchecked(0)) {
            let arr_ptr = v.as_mut_ptr();

            // There are three ways to implement insertion here:
            //
            // 1. Swap adjacent elements until the first one gets to its final destination.
            //    However, this way we copy data around more than is necessary. If elements are big
            //    structures (costly to copy), this method will be slow.
            //
            // 2. Iterate until the right place for the first element is found. Then shift the
            //    elements succeeding it to make room for it and finally place it into the
            //    remaining hole. This is a good method.
            //
            // 3. Copy the first element into a temporary variable. Iterate until the right place
            //    for it is found. As we go along, copy every traversed element into the slot
            //    preceding it. Finally, copy data from the temporary variable into the remaining
            //    hole. This method is very good. Benchmarks demonstrated slightly better
            //    performance than with the 2nd method.
            //
            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
            let tmp = mem::ManuallyDrop::new(ptr::read(arr_ptr));

            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole { src: &*tmp, dest: arr_ptr.add(1) };
            ptr::copy_nonoverlapping(arr_ptr.add(1), arr_ptr.add(0), 1);

            for i in 2..v.len() {
                if !is_less(&v.get_unchecked(i), &*tmp) {
                    break;
                }
                ptr::copy_nonoverlapping(arr_ptr.add(i), arr_ptr.add(i - 1), 1);
                hole.dest = arr_ptr.add(i);
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }
}
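Method 3 is easiest to see without the raw pointers. A safe sketch (hypothetical `insert_head_safe`, restricted to `Copy + Ord`) of the same stash-shift-fill sequence; the real code above does this with `ptr::copy_nonoverlapping` plus the `InsertionHole` guard for arbitrary `T` and without bounds checks:

```rust
fn insert_head_safe<T: Copy + Ord>(v: &mut [T]) {
    if v.len() >= 2 && v[1] < v[0] {
        // Stash the out-of-place head (the "temporary variable").
        let tmp = v[0];
        // Find where it belongs in the already-sorted tail `v[1..]`.
        let pos = v[1..].partition_point(|x| *x < tmp);
        // Shift the traversed elements one slot to the left in bulk...
        v.copy_within(1..pos + 1, 0);
        // ...and fill the remaining hole.
        v[pos] = tmp;
    }
}

fn main() {
    let mut v = [9, 1, 3, 5, 7];
    insert_head_safe(&mut v);
    assert_eq!(v, [1, 3, 5, 7, 9]);
}
```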

/// Sort `v` assuming `v[..offset]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact. Even improving performance in some cases.
#[inline(never)]
pub(super) fn insertion_sort_shift_left<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Using assert here improves performance.
    assert!(offset != 0 && offset <= len);

    // Shift each element of the unsorted region v[i..] as far left as is needed to make v sorted.
    for i in offset..len {
        // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len
        // >= 2. The range is exclusive and we know `i` must be at least 1 so this slice has at
        // least len 2.
        unsafe {
            insert_tail(&mut v[..=i], is_less);
        }
    }
}

/// Sort `v` assuming `v[offset..]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact. Even improving performance in some cases.
#[inline(never)]
fn insertion_sort_shift_right<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Using assert here improves performance.
    assert!(offset != 0 && offset <= len && len >= 2);

    // Shift each element of the unsorted region v[..i] as far left as is needed to make v sorted.
    for i in (0..offset).rev() {
        // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len
        // >= 2. We ensured that the slice length is always at least 2 long. We know that start_found
        // will be at least one less than end, and the range is exclusive. Which gives us i always
        // <= (end - 2).
        unsafe {
            insert_head(&mut v[i..len], is_less);
        }
    }
}
/// Partially sorts a slice by shifting several out-of-order elements around.
///
/// Returns `true` if the slice is sorted at the end. This function is *O*(*n*) worst-case.
#[cold]
fn partial_insertion_sort<T, F>(v: &mut [T], is_less: &mut F) -> bool
where
    F: FnMut(&T, &T) -> bool,
{
    // Maximum number of adjacent out-of-order pairs that will get shifted.
    const MAX_STEPS: usize = 5;
    // If the slice is shorter than this, don't shift any elements.
    const SHORTEST_SHIFTING: usize = 50;

    let len = v.len();
    let mut i = 1;

    for _ in 0..MAX_STEPS {
        // SAFETY: We already explicitly did the bound checking with `i < len`.
        // All our subsequent indexing is only in the range `0 <= index < len`
        unsafe {
            // Find the next pair of adjacent out-of-order elements.
            while i < len && !is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
                i += 1;
            }
        }

        // Are we done?
        if i == len {
            return true;
        }

        // Don't shift elements on short arrays, that has a performance cost.
        if len < SHORTEST_SHIFTING {
            return false;
        }

        // Swap the found pair of elements. This puts them in correct order.
        v.swap(i - 1, i);

        if i >= 2 {
            // Shift the smaller element to the left.
            insertion_sort_shift_left(&mut v[..i], i - 1, is_less);

            // Shift the greater element to the right.
            insertion_sort_shift_right(&mut v[..i], 1, is_less);
        }
    }

    // Didn't manage to sort the slice in the limited number of steps.
    false
}
/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
#[cold]
#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
pub fn heapsort<T, F>(v: &mut [T], mut is_less: F)
where
    F: FnMut(&T, &T) -> bool,
{
    // This binary heap respects the invariant `parent >= child`.
    let mut sift_down = |v: &mut [T], mut node| {
        loop {
            // Children of `node`.
            let mut child = 2 * node + 1;
            if child >= v.len() {
                break;
            }

            // Choose the greater child.
            if child + 1 < v.len() {
                // We need a branch to be sure not to out-of-bounds index,
                // but it's highly predictable. The comparison, however,
                // is better done branchless, especially for primitives.
                child += is_less(&v[child], &v[child + 1]) as usize;
            }

            // Stop if the invariant holds at `node`.
            if !is_less(&v[node], &v[child]) {
                break;
            }

            // Swap `node` with the greater child, move one step down, and continue sifting.
            v.swap(node, child);
            node = child;
        }
    };

    // Build the heap in linear time.
    for i in (0..v.len() / 2).rev() {
        sift_down(v, i);
    }

    // Pop maximal elements from the heap.
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift_down(&mut v[..i], 0);
    }
}
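The closure above is the whole algorithm. A self-contained copy of the same sift-down loop, specialized to `i32` with `<` as `is_less`, shows the two phases at work (illustrative, not the std code):

```rust
fn sift_down(v: &mut [i32], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater child; branchless bump as in the original.
        if child + 1 < v.len() {
            child += (v[child] < v[child + 1]) as usize;
        }
        // Stop once the invariant `parent >= child` holds at `node`.
        if v[node] >= v[child] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}

fn main() {
    let mut v = [5, 1, 4, 2, 3];
    // Build the max-heap bottom-up in linear time...
    for i in (0..v.len() / 2).rev() {
        sift_down(&mut v, i);
    }
    // ...then repeatedly move the current maximum to the end.
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift_down(&mut v[..i], 0);
    }
    assert_eq!(v, [1, 2, 3, 4, 5]);
}
```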

/// Partitions `v` into elements smaller than `pivot`, followed by elements greater than or equal
/// to `pivot`.
///
/// Returns the number of elements smaller than `pivot`.
///
/// Partitioning is performed block-by-block in order to minimize the cost of branching operations.
/// This idea is presented in the [BlockQuicksort][pdf] paper.
///
/// [pdf]: https://drops.dagstuhl.de/opus/volltexte/2016/6389/pdf/LIPIcs-ESA-2016-38.pdf
fn partition_in_blocks<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    // Number of elements in a typical block.
    const BLOCK: usize = 128;

    // The partitioning algorithm repeats the following steps until completion:
    //
    // 1. Trace a block from the left side to identify elements greater than or equal to the pivot.
    // 2. Trace a block from the right side to identify elements smaller than the pivot.
    // 3. Exchange the identified elements between the left and right side.
    //
    // We keep the following variables for a block of elements:
    //
    // 1. `block` - Number of elements in the block.
    // 2. `start` - Start pointer into the `offsets` array.
    // 3. `end` - End pointer into the `offsets` array.
    // 4. `offsets` - Indices of out-of-order elements within the block.

    // The current block on the left side (from `l` to `l.add(block_l)`).
    let mut l = v.as_mut_ptr();
    let mut block_l = BLOCK;
    let mut start_l = ptr::null_mut();
    let mut end_l = ptr::null_mut();
    let mut offsets_l = [MaybeUninit::<u8>::uninit(); BLOCK];

    // The current block on the right side (from `r.sub(block_r)` to `r`).
    // SAFETY: The documentation for .add() specifically mentions that `vec.as_ptr().add(vec.len())` is always safe
    let mut r = unsafe { l.add(v.len()) };
    let mut block_r = BLOCK;
    let mut start_r = ptr::null_mut();
    let mut end_r = ptr::null_mut();
    let mut offsets_r = [MaybeUninit::<u8>::uninit(); BLOCK];

    // FIXME: When we get VLAs, try creating one array of length `min(v.len(), 2 * BLOCK)` rather
    // than two fixed-size arrays of length `BLOCK`. VLAs might be more cache-efficient.

    // Returns the number of elements between pointers `l` (inclusive) and `r` (exclusive).
    fn width<T>(l: *mut T, r: *mut T) -> usize {
        assert!(mem::size_of::<T>() > 0);
        // FIXME: this should *likely* use `offset_from`, but more
        // investigation is needed (including running tests in miri).
        (r.addr() - l.addr()) / mem::size_of::<T>()
    }

    loop {
        // We are done with partitioning block-by-block when `l` and `r` get very close. Then we do
        // some patch-up work in order to partition the remaining elements in between.
        let is_done = width(l, r) <= 2 * BLOCK;

        if is_done {
            // Number of remaining elements (still not compared to the pivot).
            let mut rem = width(l, r);
            if start_l < end_l || start_r < end_r {
                rem -= BLOCK;
            }

            // Adjust block sizes so that the left and right block don't overlap, but get perfectly
            // aligned to cover the whole remaining gap.
            if start_l < end_l {
                block_r = rem;
            } else if start_r < end_r {
                block_l = rem;
            } else {
                // There were the same number of elements to switch on both blocks during the last
                // iteration, so there are no remaining elements on either block. Cover the remaining
                // items with roughly equally-sized blocks.
                block_l = rem / 2;
                block_r = rem - block_l;
            }
            debug_assert!(block_l <= BLOCK && block_r <= BLOCK);
            debug_assert!(width(l, r) == block_l + block_r);
        }

        if start_l == end_l {
            // Trace `block_l` elements from the left side.
            start_l = MaybeUninit::slice_as_mut_ptr(&mut offsets_l);
            end_l = start_l;
            let mut elem = l;

            for i in 0..block_l {
                // SAFETY: The unsafety operations below involve the usage of the `offset`.
                // According to the conditions required by the function, we satisfy them because:
                // 1. `offsets_l` is stack-allocated, and thus considered separate allocated object.
                // 2. The function `is_less` returns a `bool`.
                //    Casting a `bool` will never overflow `isize`.
                // 3. We have guaranteed that `block_l` will be `<= BLOCK`.
                //    Plus, `end_l` was initially set to the begin pointer of `offsets_` which was declared on the stack.
                //    Thus, we know that even in the worst case (all invocations of `is_less` return false) we will only be at most 1 byte past the end.
                // Another unsafety operation here is dereferencing `elem`.
                // However, `elem` was initially the begin pointer to the slice which is always valid.
                unsafe {
                    // Branchless comparison.
                    *end_l = i as u8;
                    end_l = end_l.add(!is_less(&*elem, pivot) as usize);
                    elem = elem.add(1);
                }
            }
        }

        if start_r == end_r {
            // Trace `block_r` elements from the right side.
            start_r = MaybeUninit::slice_as_mut_ptr(&mut offsets_r);
            end_r = start_r;
            let mut elem = r;

            for i in 0..block_r {
                // SAFETY: The unsafety operations below involve the usage of the `offset`.
                // According to the conditions required by the function, we satisfy them because:
                // 1. `offsets_r` is stack-allocated, and thus considered separate allocated object.
                // 2. The function `is_less` returns a `bool`.
                //    Casting a `bool` will never overflow `isize`.
                // 3. We have guaranteed that `block_r` will be `<= BLOCK`.
                //    Plus, `end_r` was initially set to the begin pointer of `offsets_` which was declared on the stack.
                //    Thus, we know that even in the worst case (all invocations of `is_less` return true) we will only be at most 1 byte past the end.
                // Another unsafety operation here is dereferencing `elem`.
                // However, `elem` was initially `1 * sizeof(T)` past the end and we decrement it by `1 * sizeof(T)` before accessing it.
                // Plus, `block_r` was asserted to be less than `BLOCK` and `elem` will therefore at most be pointing to the beginning of the slice.
                unsafe {
                    // Branchless comparison.
                    elem = elem.sub(1);
                    *end_r = i as u8;
                    end_r = end_r.add(is_less(&*elem, pivot) as usize);
                }
            }
        }

        // Number of out-of-order elements to swap between the left and right side.
        let count = cmp::min(width(start_l, end_l), width(start_r, end_r));

        if count > 0 {
            macro_rules! left {
                () => {
                    l.add(usize::from(*start_l))
                };
            }
            macro_rules! right {
                () => {
                    r.sub(usize::from(*start_r) + 1)
                };
            }

            // Instead of swapping one pair at the time, it is more efficient to perform a cyclic
            // permutation. This is not strictly equivalent to swapping, but produces a similar
            // result using fewer memory operations.

            // SAFETY: The use of `ptr::read` is valid because there is at least one element in
            // both `offsets_l` and `offsets_r`, so `left!` is a valid pointer to read from.
            //
            // The uses of `left!` involve calls to `offset` on `l`, which points to the
            // beginning of `v`. All the offsets pointed-to by `start_l` are at most `block_l`, so
            // these `offset` calls are safe as all reads are within the block. The same argument
            // applies for the uses of `right!`.
            //
            // The calls to `start_l.offset` are valid because there are at most `count-1` of them,
            // plus the final one at the end of the unsafe block, where `count` is the minimum number
            // of collected offsets in `offsets_l` and `offsets_r`, so there is no risk of there not
            // being enough elements. The same reasoning applies to the calls to `start_r.offset`.
            //
            // The calls to `copy_nonoverlapping` are safe because `left!` and `right!` are guaranteed
|
||||
// not to overlap, and are valid because of the reasoning above.
|
||||
unsafe {
|
||||
let tmp = ptr::read(left!());
|
||||
ptr::copy_nonoverlapping(right!(), left!(), 1);
|
||||
|
||||
for _ in 1..count {
|
||||
start_l = start_l.add(1);
|
||||
ptr::copy_nonoverlapping(left!(), right!(), 1);
|
||||
start_r = start_r.add(1);
|
||||
ptr::copy_nonoverlapping(right!(), left!(), 1);
|
||||
}
|
||||
|
||||
ptr::copy_nonoverlapping(&tmp, right!(), 1);
|
||||
mem::forget(tmp);
|
||||
start_l = start_l.add(1);
|
||||
start_r = start_r.add(1);
|
||||
}
|
||||
}
|
||||
|
||||
if start_l == end_l {
|
||||
// All out-of-order elements in the left block were moved. Move to the next block.
|
||||
|
||||
// block-width-guarantee
|
||||
// SAFETY: if `!is_done` then the slice width is guaranteed to be at least `2*BLOCK` wide. There
|
||||
// are at most `BLOCK` elements in `offsets_l` because of its size, so the `offset` operation is
|
||||
// safe. Otherwise, the debug assertions in the `is_done` case guarantee that
|
||||
// `width(l, r) == block_l + block_r`, namely, that the block sizes have been adjusted to account
|
||||
// for the smaller number of remaining elements.
|
||||
l = unsafe { l.add(block_l) };
|
||||
}
|
||||
|
||||
if start_r == end_r {
|
||||
// All out-of-order elements in the right block were moved. Move to the previous block.
|
||||
|
||||
// SAFETY: Same argument as [block-width-guarantee]. Either this is a full block `2*BLOCK`-wide,
|
||||
// or `block_r` has been adjusted for the last handful of elements.
|
||||
r = unsafe { r.sub(block_r) };
|
||||
}
|
||||
|
||||
if is_done {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// All that remains now is at most one block (either the left or the right) with out-of-order
|
||||
// elements that need to be moved. Such remaining elements can be simply shifted to the end
|
||||
// within their block.
|
||||
|
||||
if start_l < end_l {
|
||||
// The left block remains.
|
||||
// Move its remaining out-of-order elements to the far right.
|
||||
debug_assert_eq!(width(l, r), block_l);
|
||||
while start_l < end_l {
|
||||
// remaining-elements-safety
|
||||
// SAFETY: while the loop condition holds there are still elements in `offsets_l`, so it
|
||||
// is safe to point `end_l` to the previous element.
|
||||
//
|
||||
// The `ptr::swap` is safe if both its arguments are valid for reads and writes:
|
||||
// - Per the debug assert above, the distance between `l` and `r` is `block_l`
|
||||
// elements, so there can be at most `block_l` remaining offsets between `start_l`
|
||||
// and `end_l`. This means `r` will be moved at most `block_l` steps back, which
|
||||
// makes the `r.offset` calls valid (at that point `l == r`).
|
||||
// - `offsets_l` contains valid offsets into `v` collected during the partitioning of
|
||||
// the last block, so the `l.offset` calls are valid.
|
||||
unsafe {
|
||||
end_l = end_l.sub(1);
|
||||
ptr::swap(l.add(usize::from(*end_l)), r.sub(1));
|
||||
r = r.sub(1);
|
||||
}
|
||||
}
|
||||
width(v.as_mut_ptr(), r)
|
||||
} else if start_r < end_r {
|
||||
// The right block remains.
|
||||
// Move its remaining out-of-order elements to the far left.
|
||||
debug_assert_eq!(width(l, r), block_r);
|
||||
while start_r < end_r {
|
||||
// SAFETY: See the reasoning in [remaining-elements-safety].
|
||||
unsafe {
|
||||
end_r = end_r.sub(1);
|
||||
ptr::swap(l, r.sub(usize::from(*end_r) + 1));
|
||||
l = l.add(1);
|
||||
}
|
||||
}
|
||||
width(v.as_mut_ptr(), l)
|
||||
} else {
|
||||
// Nothing else to do, we're done.
|
||||
width(v.as_mut_ptr(), l)
|
||||
}
|
||||
}
|
||||
|
||||
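// Illustrative sketch (editor's addition, not part of this change): the cyclic
// permutation used above, reduced to safe code on plain indices. Instead of three
// `swap`s per three-element cycle (nine moves), one element is parked in a
// temporary and the rest are moved along the chain (n + 1 moves for a cycle of
// length n). `cycle_left` is a hypothetical helper, not an item in this file.
fn cycle_left<T: Copy>(xs: &mut [T], cycle: &[usize]) {
    if cycle.len() < 2 {
        return;
    }
    // Park the first element, shift the rest one step along the cycle.
    let tmp = xs[cycle[0]];
    for w in cycle.windows(2) {
        xs[w[0]] = xs[w[1]];
    }
    xs[*cycle.last().unwrap()] = tmp;
}
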
/// Partitions `v` into elements smaller than `v[pivot]`, followed by elements greater than or
/// equal to `v[pivot]`.
///
/// Returns a tuple of:
///
/// 1. Number of elements smaller than `v[pivot]`.
/// 2. True if `v` was already partitioned.
pub(super) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
let (mid, was_partitioned) = {
// Place the pivot at the beginning of the slice.
v.swap(0, pivot);
let (pivot, v) = v.split_at_mut(1);
let pivot = &mut pivot[0];

// Read the pivot into a stack-allocated variable for efficiency. If a following comparison
// operation panics, the pivot will be automatically written back into the slice.

// SAFETY: `pivot` is a reference to the first element of `v`, so `ptr::read` is safe.
let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot };
let pivot = &*tmp;

// Find the first pair of out-of-order elements.
let mut l = 0;
let mut r = v.len();

// SAFETY: The unsafety below involves indexing an array.
// For the first one: We already do the bounds checking here with `l < r`.
// For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation.
// From here we know that `r` must be at least `l`, which was shown to be valid in the first case.
unsafe {
// Find the first element greater than or equal to the pivot.
while l < r && is_less(v.get_unchecked(l), pivot) {
l += 1;
}

// Find the last element smaller than the pivot.
while l < r && !is_less(v.get_unchecked(r - 1), pivot) {
r -= 1;
}
}

(l + partition_in_blocks(&mut v[l..r], pivot, is_less), l >= r)

// `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated
// variable) back into the slice where it originally was. This step is critical in ensuring
// safety!
};

// Place the pivot between the two partitions.
v.swap(0, mid);

(mid, was_partitioned)
}

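// Illustrative sketch (editor's addition, not part of this change): the contract
// of `partition` above, checked with safe code. The helper name and the direct
// call to the module-private `partition` are assumptions for illustration only.
#[test]
fn partition_contract_sketch() {
    let mut v = vec![5, 1, 9, 3, 7];
    let mut is_less = |a: &i32, b: &i32| a < b;
    // Any in-bounds pivot index works for this check.
    let (mid, _was_partitioned) = partition(&mut v, 2, &mut is_less);
    let pivot = v[mid];
    // Everything left of `mid` is smaller than the pivot; everything from `mid`
    // onwards (including the pivot itself) is greater than or equal to it.
    assert!(v[..mid].iter().all(|x| *x < pivot));
    assert!(v[mid..].iter().all(|x| *x >= pivot));
}
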
/// Partitions `v` into elements equal to `v[pivot]` followed by elements greater than `v[pivot]`.
///
/// Returns the number of elements equal to the pivot. It is assumed that `v` does not contain
/// elements smaller than the pivot.
pub(super) fn partition_equal<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
// Place the pivot at the beginning of the slice.
v.swap(0, pivot);
let (pivot, v) = v.split_at_mut(1);
let pivot = &mut pivot[0];

// Read the pivot into a stack-allocated variable for efficiency. If a following comparison
// operation panics, the pivot will be automatically written back into the slice.
// SAFETY: The pointer here is valid because it is obtained from a reference to a slice.
let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot };
let pivot = &*tmp;

let len = v.len();
if len == 0 {
return 0;
}

// Now partition the slice.
let mut l = 0;
let mut r = len;
loop {
// SAFETY: The unsafety below involves indexing an array.
// For the first one: We already do the bounds checking here with `l < r`.
// For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation.
// From here we know that `r` must be at least `l`, which was shown to be valid in the first case.
unsafe {
// Find the first element greater than the pivot.
while l < r && !is_less(pivot, v.get_unchecked(l)) {
l += 1;
}

// Find the last element equal to the pivot.
loop {
r -= 1;
if l >= r || !is_less(pivot, v.get_unchecked(r)) {
break;
}
}

// Are we done?
if l >= r {
break;
}

// Swap the found pair of out-of-order elements.
let ptr = v.as_mut_ptr();
ptr::swap(ptr.add(l), ptr.add(r));
l += 1;
}
}

// We found `l` elements equal to the pivot. Add 1 to account for the pivot itself.
l + 1

// `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated variable)
// back into the slice where it originally was. This step is critical in ensuring safety!
}

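// Illustrative sketch (editor's addition, not part of this change): the contract
// of `partition_equal` above. The direct call to the module-private function is
// an assumption for illustration only.
#[test]
fn partition_equal_contract_sketch() {
    // The pivot value 1 is the minimum, so the precondition (no element smaller
    // than the pivot) holds.
    let mut v = vec![1, 3, 1, 2, 1];
    let mut is_less = |a: &i32, b: &i32| a < b;
    let count = partition_equal(&mut v, 0, &mut is_less);
    assert_eq!(count, 3); // three elements equal to the pivot
    assert!(v[..count].iter().all(|x| *x == 1));
    assert!(v[count..].iter().all(|x| *x > 1));
}
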
/// Scatters some elements around in an attempt to break patterns that might cause imbalanced
/// partitions in quicksort.
#[cold]
pub(super) fn break_patterns<T>(v: &mut [T]) {
let len = v.len();
if len >= 8 {
let mut seed = len;
let mut gen_usize = || {
// Pseudorandom number generator from the "Xorshift RNGs" paper by George Marsaglia.
if usize::BITS <= 32 {
let mut r = seed as u32;
r ^= r << 13;
r ^= r >> 17;
r ^= r << 5;
seed = r as usize;
seed
} else {
let mut r = seed as u64;
r ^= r << 13;
r ^= r >> 7;
r ^= r << 17;
seed = r as usize;
seed
}
};

// Take random numbers modulo this number.
// The number fits into `usize` because `len` is not greater than `isize::MAX`.
let modulus = len.next_power_of_two();

// Some pivot candidates will be near this index. Let's randomize them.
let pos = len / 4 * 2;

for i in 0..3 {
// Generate a random number modulo `len`. However, in order to avoid costly operations
// we first take it modulo a power of two, and then decrease by `len` until it fits
// into the range `[0, len - 1]`.
let mut other = gen_usize() & (modulus - 1);

// `other` is guaranteed to be less than `2 * len`.
if other >= len {
other -= len;
}

v.swap(pos - 1 + i, other);
}
}
}

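// Illustrative sketch (editor's addition, not part of this change): the 64-bit
// xorshift step used above, extracted as a standalone function. Each of the
// three shift-xor steps is invertible, so the whole step is a bijection on
// 64-bit states; since 0 maps to 0, a nonzero seed can never collapse to 0.
fn xorshift64(mut x: u64) -> u64 {
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    x
}

#[test]
fn xorshift64_preserves_nonzero_states() {
    let mut s = 0xDEADBEEFu64;
    for _ in 0..1_000 {
        s = xorshift64(s);
        assert_ne!(s, 0);
    }
}
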
/// Chooses a pivot in `v` and returns the index and `true` if the slice is likely already sorted.
///
/// Elements in `v` might be reordered in the process.
pub(super) fn choose_pivot<T, F>(v: &mut [T], is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
// Minimum length to choose the median-of-medians method.
// Shorter slices use the simple median-of-three method.
const SHORTEST_MEDIAN_OF_MEDIANS: usize = 50;
// Maximum number of swaps that can be performed in this function.
const MAX_SWAPS: usize = 4 * 3;

let len = v.len();

// Three indices near which we are going to choose a pivot.
let mut a = len / 4 * 1;
let mut b = len / 4 * 2;
let mut c = len / 4 * 3;

// Counts the total number of swaps we are about to perform while sorting indices.
let mut swaps = 0;

if len >= 8 {
// Swaps indices so that `v[a] <= v[b]`.
// SAFETY: `len >= 8` so there are at least two elements in the neighborhoods of
// `a`, `b` and `c`. This means the three calls to `sort_adjacent` result in
// corresponding calls to `sort3` with valid 3-item neighborhoods around each
// pointer, which in turn means the calls to `sort2` are done with valid
// references. Thus the `v.get_unchecked` calls are safe, as is the `ptr::swap`
// call.
let mut sort2 = |a: &mut usize, b: &mut usize| unsafe {
if is_less(v.get_unchecked(*b), v.get_unchecked(*a)) {
ptr::swap(a, b);
swaps += 1;
}
};

// Swaps indices so that `v[a] <= v[b] <= v[c]`.
let mut sort3 = |a: &mut usize, b: &mut usize, c: &mut usize| {
sort2(a, b);
sort2(b, c);
sort2(a, b);
};

if len >= SHORTEST_MEDIAN_OF_MEDIANS {
// Finds the median of `v[a - 1], v[a], v[a + 1]` and stores the index into `a`.
let mut sort_adjacent = |a: &mut usize| {
let tmp = *a;
sort3(&mut (tmp - 1), a, &mut (tmp + 1));
};

// Find medians in the neighborhoods of `a`, `b`, and `c`.
sort_adjacent(&mut a);
sort_adjacent(&mut b);
sort_adjacent(&mut c);
}

// Find the median among `a`, `b`, and `c`.
sort3(&mut a, &mut b, &mut c);
}

if swaps < MAX_SWAPS {
(b, swaps == 0)
} else {
// The maximum number of swaps was performed. Chances are the slice is descending or mostly
// descending, so reversing will probably help sort it faster.
v.reverse();
(len - 1 - b, true)
}
}

/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `pred`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &mut F, mut pred: Option<&'a T>, mut limit: u32)
where
F: FnMut(&T, &T) -> bool,
{
// Slices of up to this length get sorted using insertion sort.
const MAX_INSERTION: usize = 20;

// True if the last partitioning was reasonably balanced.
let mut was_balanced = true;
// True if the last partitioning didn't shuffle elements (the slice was already partitioned).
let mut was_partitioned = true;

loop {
let len = v.len();

// Very short slices get sorted using insertion sort.
if len <= MAX_INSERTION {
if len >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
}

// If too many bad pivot choices were made, simply fall back to heapsort in order to
// guarantee `O(n * log(n))` worst-case.
if limit == 0 {
heapsort(v, is_less);
return;
}

// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
// some elements around. Hopefully we'll choose a better pivot this time.
if !was_balanced {
break_patterns(v);
limit -= 1;
}

// Choose a pivot and try guessing whether the slice is already sorted.
let (pivot, likely_sorted) = choose_pivot(v, is_less);

// If the last partitioning was decently balanced and didn't shuffle elements, and if pivot
// selection predicts the slice is likely already sorted...
if was_balanced && was_partitioned && likely_sorted {
// Try identifying several out-of-order elements and shifting them to correct
// positions. If the slice ends up being completely sorted, we're done.
if partial_insertion_sort(v, is_less) {
return;
}
}

// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = pred {
if !is_less(p, &v[pivot]) {
let mid = partition_equal(v, pivot, is_less);

// Continue sorting elements greater than the pivot.
v = &mut v[mid..];
continue;
}
}

// Partition the slice.
let (mid, was_p) = partition(v, pivot, is_less);
was_balanced = cmp::min(mid, len - mid) >= len / 8;
was_partitioned = was_p;

// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
let (pivot, right) = right.split_at_mut(1);
let pivot = &pivot[0];

// Recurse into the shorter side only in order to minimize the total number of recursive
// calls and consume less stack space. Then just continue with the longer side (this is
// akin to tail recursion).
if left.len() < right.len() {
recurse(left, is_less, pred, limit);
v = right;
pred = Some(pivot);
} else {
recurse(right, is_less, Some(pivot), limit);
v = left;
}
}
}

/// Sorts `v` using pattern-defeating quicksort, which is *O*(*n* \* log(*n*)) worst-case.
pub fn quicksort<T, F>(v: &mut [T], mut is_less: F)
where
F: FnMut(&T, &T) -> bool,
{
// Sorting has no meaningful behavior on zero-sized types.
if T::IS_ZST {
return;
}

// Limit the number of imbalanced partitions to `floor(log2(len)) + 1`.
let limit = usize::BITS - v.len().leading_zeros();

recurse(v, &mut is_less, None, limit);
}

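// Illustrative sketch (editor's addition, not part of this change): the limit
// computation above in isolation. For `len > 0`, `usize::BITS - len.leading_zeros()`
// is the number of significant bits of `len`, i.e. `floor(log2(len)) + 1`.
fn imbalance_limit(len: usize) -> u32 {
    usize::BITS - len.leading_zeros()
}

#[test]
fn imbalance_limit_examples() {
    assert_eq!(imbalance_limit(1), 1); // floor(log2(1)) + 1
    assert_eq!(imbalance_limit(1000), 10); // 512 <= 1000 < 1024
    assert_eq!(imbalance_limit(1024), 11); // exactly 2^10
}
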
/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
/// stores the result into `v[..]`.
///
/// # Safety
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
let v = v.as_mut_ptr();

// SAFETY: mid and len must be in-bounds of v.
let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };

// The merge process first copies the shorter run into `buf`. Then it traces the newly copied
// run and the longer run forwards (or backwards), comparing their next unconsumed elements and
// copying the lesser (or greater) one into `v`.
//
// As soon as the shorter run is fully consumed, the process is done. If the longer run gets
// consumed first, then we must copy whatever is left of the shorter run into the remaining
// hole in `v`.
//
// Intermediate state of the process is always tracked by `hole`, which serves two purposes:
// 1. Protects integrity of `v` from panics in `is_less`.
// 2. Fills the remaining hole in `v` if the longer run gets consumed first.
//
// Panic safety:
//
// If `is_less` panics at any point during the process, `hole` will get dropped and fill the
// hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
// object it initially held exactly once.
let mut hole;

if mid <= len - mid {
// The left run is shorter.

// SAFETY: buf must have enough capacity for `v[..mid]`.
unsafe {
ptr::copy_nonoverlapping(v, buf, mid);
hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
}

// Initially, these pointers point to the beginnings of their arrays.
let left = &mut hole.start;
let mut right = v_mid;
let out = &mut hole.dest;

while *left < hole.end && right < v_end {
// Consume the lesser side.
// If equal, prefer the left run to maintain stability.

// SAFETY: `left` and `right` must be valid and part of `v`; the same holds for `out`.
unsafe {
let is_l = is_less(&*right, &**left);
let to_copy = if is_l { right } else { *left };
ptr::copy_nonoverlapping(to_copy, *out, 1);
*out = out.add(1);
right = right.add(is_l as usize);
*left = left.add(!is_l as usize);
}
}
} else {
// The right run is shorter.

// SAFETY: buf must have enough capacity for `v[mid..]`.
unsafe {
ptr::copy_nonoverlapping(v_mid, buf, len - mid);
hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
}

// Initially, these pointers point past the ends of their arrays.
let left = &mut hole.dest;
let right = &mut hole.end;
let mut out = v_end;

while v < *left && buf < *right {
// Consume the greater side.
// If equal, prefer the right run to maintain stability.

// SAFETY: `left` and `right` must be valid and part of `v`; the same holds for `out`.
unsafe {
let is_l = is_less(&*right.sub(1), &*left.sub(1));
*left = left.sub(is_l as usize);
*right = right.sub(!is_l as usize);
let to_copy = if is_l { *left } else { *right };
out = out.sub(1);
ptr::copy_nonoverlapping(to_copy, out, 1);
}
}
}
// Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
// it will now be copied into the hole in `v`.

// When dropped, copies the range `start..end` into `dest..`.
struct MergeHole<T> {
start: *mut T,
end: *mut T,
dest: *mut T,
}

impl<T> Drop for MergeHole<T> {
fn drop(&mut self) {
// SAFETY: `T` is not a zero-sized type, and these are pointers into a slice's elements.
unsafe {
let len = self.end.sub_ptr(self.start);
ptr::copy_nonoverlapping(self.start, self.dest, len);
}
}
}
}

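// Illustrative sketch (editor's addition, not part of this change): the
// drop-guard pattern behind `MergeHole`, reduced to its essence. If `is_less`
// panics mid-merge, unwinding runs the guard's `Drop` impl, which copies the
// unconsumed elements back out of the scratch buffer so the slice once again
// owns each of its elements exactly once.
struct CopyBackOnDrop<T> {
    src: *mut T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyBackOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: this sketch assumes `src` and `dst` point to `len` valid,
        // non-overlapping elements, exactly as `MergeHole` guarantees.
        unsafe { core::ptr::copy_nonoverlapping(self.src, self.dst, self.len) }
    }
}
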
/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in
/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However,
/// Python has since switched to a Powersort-based implementation.
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied:
///
/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
v: &mut [T],
is_less: &mut CmpF,
elem_alloc_fn: ElemAllocF,
elem_dealloc_fn: ElemDeallocF,
run_alloc_fn: RunAllocF,
run_dealloc_fn: RunDeallocF,
) where
CmpF: FnMut(&T, &T) -> bool,
ElemAllocF: Fn(usize) -> *mut T,
ElemDeallocF: Fn(*mut T, usize),
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
// Slices of up to this length get sorted using insertion sort.
const MAX_INSERTION: usize = 20;

// The caller should have already checked that.
debug_assert!(!T::IS_ZST);

let len = v.len();

// Short arrays get sorted in-place via insertion sort to avoid allocations.
if len <= MAX_INSERTION {
if len >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
}

// Allocate a buffer to use as scratch memory. We keep its length at 0 so that we can keep
// shallow copies of the contents of `v` in it, without risking the dtors running on those
// copies if `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
// shorter run, which will always have length at most `len / 2`.
let buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn);
let buf_ptr = buf.buf_ptr.as_ptr();

let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);

let mut end = 0;
let mut start = 0;

// Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
// code-gen is usually better. For the most sensitive types such as integers, these are merged
// bidirectionally at once. So there is no benefit in scanning backwards.
while end < len {
let (streak_end, was_reversed) = find_streak(&v[start..], is_less);
end += streak_end;
if was_reversed {
v[start..end].reverse();
}

// Insert some more elements into the run if it's too short. Insertion sort is faster than
// merge sort on short sequences, so this significantly improves performance.
end = provide_sorted_batch(v, start, end, is_less);

// Push this run onto the stack.
runs.push(TimSortRun { start, len: end - start });
start = end;

// Merge some pairs of adjacent runs to satisfy the invariants.
while let Some(r) = collapse(runs.as_slice(), len) {
let left = runs[r];
let right = runs[r + 1];
let merge_slice = &mut v[left.start..right.start + right.len];
// SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
// neither side may be of length 0.
unsafe {
merge(merge_slice, left.len, buf_ptr, is_less);
}
runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };
runs.remove(r);
}
}

// Finally, exactly one run must remain in the stack.
debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);

// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
// algorithm should continue building a new run instead, `None` is returned.
//
// TimSort is infamous for its buggy implementations, as described here:
// http://envisage-project.eu/timsort-specification-and-verification/
//
// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
// Enforcing them on just the top three is not sufficient to ensure that the invariants will
// still hold for *all* runs in the stack.
//
// This function correctly checks invariants for the top four runs. Additionally, if the top
// run starts at index 0, it will always demand a merge operation until the stack is fully
// collapsed, in order to complete the sort. (A worked example follows this function.)
#[inline]
fn collapse(runs: &[TimSortRun], stop: usize) -> Option<usize> {
let n = runs.len();
if n >= 2
&& (runs[n - 1].start + runs[n - 1].len == stop
|| runs[n - 2].len <= runs[n - 1].len
|| (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
|| (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
{
if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
} else {
None
}
}

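// Illustrative example (editor's addition, not part of this change): with run
// lengths [30, 20, 10] on the stack, invariant 2 is violated at the top
// (30 <= 20 + 10), so `collapse` demands a merge of the top two runs. The
// literal construction of `TimSortRun` assumes access to its crate-private
// fields and is for illustration only.
#[test]
fn collapse_requests_merge_on_invariant_violation() {
    let runs = [
        TimSortRun { start: 0, len: 30 },
        TimSortRun { start: 30, len: 20 },
        TimSortRun { start: 50, len: 10 },
    ];
    // `stop` is the total slice length; the top run does not reach it here,
    // so this is not a forced end-of-slice collapse.
    assert_eq!(collapse(&runs, 100), Some(1));
}
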
// Extremely basic versions of Vec.
// Their use is very limited, and keeping the code here allows reuse between the sort
// implementations.
struct BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
buf_ptr: ptr::NonNull<T>,
capacity: usize,
elem_dealloc_fn: ElemDeallocF,
}

impl<T, ElemDeallocF> BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
fn new<ElemAllocF>(
len: usize,
elem_alloc_fn: ElemAllocF,
elem_dealloc_fn: ElemDeallocF,
) -> Self
where
ElemAllocF: Fn(usize) -> *mut T,
{
Self {
buf_ptr: ptr::NonNull::new(elem_alloc_fn(len)).unwrap(),
capacity: len,
elem_dealloc_fn,
}
}
}

impl<T, ElemDeallocF> Drop for BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
fn drop(&mut self) {
(self.elem_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity);
}
}

struct RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
buf_ptr: ptr::NonNull<TimSortRun>,
capacity: usize,
len: usize,
run_alloc_fn: RunAllocF,
run_dealloc_fn: RunDeallocF,
}

impl<RunAllocF, RunDeallocF> RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self {
// Most slices can be sorted with at most 16 runs in-flight.
const START_RUN_CAPACITY: usize = 16;

Self {
buf_ptr: ptr::NonNull::new(run_alloc_fn(START_RUN_CAPACITY)).unwrap(),
capacity: START_RUN_CAPACITY,
len: 0,
run_alloc_fn,
run_dealloc_fn,
}
}

fn push(&mut self, val: TimSortRun) {
if self.len == self.capacity {
let old_capacity = self.capacity;
let old_buf_ptr = self.buf_ptr.as_ptr();

self.capacity = self.capacity * 2;
self.buf_ptr = ptr::NonNull::new((self.run_alloc_fn)(self.capacity)).unwrap();

// SAFETY: the new and old buf_ptr were correctly allocated and old_buf_ptr has
// old_capacity valid elements.
unsafe {
ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr.as_ptr(), old_capacity);
}

(self.run_dealloc_fn)(old_buf_ptr, old_capacity);
}

// SAFETY: The invariant was just checked.
unsafe {
self.buf_ptr.as_ptr().add(self.len).write(val);
}
self.len += 1;
}

fn remove(&mut self, index: usize) {
if index >= self.len {
panic!("Index out of bounds");
}

// SAFETY: buf_ptr needs to be valid and len invariant upheld.
unsafe {
// The place we are taking from.
let ptr = self.buf_ptr.as_ptr().add(index);

// Shift everything down to fill in that spot.
ptr::copy(ptr.add(1), ptr, self.len - index - 1);
}
self.len -= 1;
}

fn as_slice(&self) -> &[TimSortRun] {
// SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld.
unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr.as_ptr(), self.len) }
}

fn len(&self) -> usize {
self.len
}
}

impl<RunAllocF, RunDeallocF> core::ops::Index<usize> for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
type Output = TimSortRun;

fn index(&self, index: usize) -> &Self::Output {
if index < self.len {
// SAFETY: buf_ptr and len invariant must be upheld.
unsafe {
return &*(self.buf_ptr.as_ptr().add(index));
}
}

panic!("Index out of bounds");
}
}

impl<RunAllocF, RunDeallocF> core::ops::IndexMut<usize> for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
if index < self.len {
// SAFETY: buf_ptr and len invariant must be upheld.
unsafe {
return &mut *(self.buf_ptr.as_ptr().add(index));
}
}

panic!("Index out of bounds");
}
}

impl<RunAllocF, RunDeallocF> Drop for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn drop(&mut self) {
// As long as TimSortRun is Copy we don't need to drop them individually but just the
// whole allocation.
(self.run_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity);
}
}
}

/// Internal type used by merge_sort.
#[derive(Clone, Copy, Debug)]
pub struct TimSortRun {
len: usize,
start: usize,
}

/// Takes a range denoted by `start` and `end` that is already sorted, and extends it to the right
/// if necessary with sorts optimized for smaller ranges such as insertion sort.
fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
assert!(end >= start && end <= len);

// This value is a balance between least comparisons and best performance, as
// influenced by for example cache locality.
const MIN_INSERTION_RUN: usize = 10;

// Insert some more elements into the run if it's too short. Insertion sort is faster than
// merge sort on short sequences, so this significantly improves performance.
let start_end_diff = end - start;

if start_end_diff < MIN_INSERTION_RUN && end < len {
// v[start_found..end] are elements that are already sorted in the input. We want to extend
// the sorted region to the left, so we push up to MIN_INSERTION_RUN - 1 elements to the
// right, which is more efficient than trying to push those already sorted elements to
// the left.
end = cmp::min(start + MIN_INSERTION_RUN, len);
let presorted_start = cmp::max(start_end_diff, 1);

insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);
}

end
}

/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the index
/// of the first element that is not part of said streak, and a bool denoting whether the streak
/// was reversed. Streaks can be increasing or decreasing.
fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

if len < 2 {
return (len, false);
}

let mut end = 2;

// SAFETY: See the specific comments below.
unsafe {
// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));

// SAFETY: We know end >= 2 and check end < len.
// From that follows that accessing v at end and end - 1 is safe.
if assume_reverse {
while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
end += 1;
}

(end, true)
} else {
while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
end += 1;
}
(end, false)
}
}
}

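// Illustrative example (editor's addition, not part of this change): what
// `find_streak` reports for a few inputs. The direct call to the module-private
// function is an assumption for illustration only.
#[test]
fn find_streak_examples() {
    let mut lt = |a: &i32, b: &i32| a < b;
    // Leading non-descending streak of length 4, not reversed.
    assert_eq!(find_streak(&[1, 2, 2, 3, 0], &mut lt), (4, false));
    // Leading strictly descending streak of length 3, reported as reversed.
    assert_eq!(find_streak(&[3, 2, 1, 9], &mut lt), (3, true));
    // Slices shorter than 2 are their own (trivial) streak.
    assert_eq!(find_streak(&[7], &mut lt), (1, false));
}
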
@@ -0,0 +1,8 @@
//! This module and the contained sub-modules contain the code for efficient and robust sort
//! implementations, as well as the domain-adjacent implementation of `select_nth_unstable`.

pub mod stable;
pub mod unstable;

pub(crate) mod select;
pub(crate) mod shared;

@@ -1,45 +1,78 @@
//! Slice selection
//!
//! This module contains the implementation for `slice::select_nth_unstable`.
//! It uses an introselect algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//! It uses an introselect algorithm based on ipnsort by Lukas Bergdoll and Orson Peters,
//! published at: <https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort>
//!
//! The fallback algorithm used for introselect is Median of Medians using Tukey's Ninther
//! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
//! better performance than one would get using heapsort as fallback.

use crate::cmp;
use crate::mem::{self, SizedTypeProperties};
use crate::slice::sort::{
break_patterns, choose_pivot, insertion_sort_shift_left, partition, partition_equal,
};

// For slices of up to this length it's probably faster to simply sort them.
// Defined at the module scope because it's used in multiple functions.
const MAX_INSERTION: usize = 10;
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;

/// Reorder the slice such that the element at `index` is at its final sorted position.
pub(crate) fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

// Puts a lower limit of 1 on `len`.
if index >= len {
panic!("partition_at_index index {} greater than length of slice {}", index, len);
}

if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == len - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}

let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}

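// Illustrative example (editor's addition, not part of this change): the
// observable contract of the function above, expressed against the public
// `select_nth_unstable` API that wraps it.
#[test]
fn select_nth_places_index_at_final_sorted_position() {
    let mut v = [9, 4, 7, 1, 3];
    let (lesser, median, greater) = v.select_nth_unstable(2);
    assert_eq!(*median, 4); // the value sorted order puts at index 2
    assert!(lesser.iter().all(|x| *x <= *median));
    assert!(greater.iter().all(|x| *x >= *median));
}
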
// For small sub-slices it's faster to use a dedicated small-sort, but because it is only called at
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
mut ancestor_pivot: Option<&'a T>,
is_less: &mut F,
mut pred: Option<&'a T>,
) where
F: FnMut(&T, &T) -> bool,
{
// Limit the amount of iterations and fall back to fast deterministic selection
// to ensure O(n) worst case running time. This limit needs to be constant, because
// using `ilog2(len)` like in `sort` would result in O(n log n) time complexity.
// The exact value of the limit is chosen somewhat arbitrarily, but for most inputs bad pivot
// selections should be relatively rare, so the limit usually shouldn't be reached
// anyways.
// Limit the amount of iterations and fall back to fast deterministic selection to ensure O(n)
// worst case running time. This limit needs to be constant, because using `ilog2(len)` like in
// `sort` would result in O(n log n) time complexity. The exact value of the limit is chosen
// somewhat arbitrarily, but for most inputs bad pivot selections should be relatively rare, so
// the limit is only reached for sub-slices of length `len / 2^limit` or less, which makes the
// remaining work with the fallback minimal in relative terms.
let mut limit = 16;

// True if the last partitioning was reasonably balanced.
let mut was_balanced = true;

loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@@ -50,38 +83,35 @@ fn partition_at_index_loop<'a, T, F>(
return;
}

// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
// some elements around. Hopefully we'll choose a better pivot this time.
if !was_balanced {
break_patterns(v);
limit -= 1;
}
limit -= 1;

// Choose a pivot
let (pivot, _) = choose_pivot(v, is_less);
let pivot_pos = choose_pivot(v, is_less);

// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = pred {
if !is_less(p, &v[pivot]) {
let mid = partition_equal(v, pivot, is_less);
if let Some(p) = ancestor_pivot {
if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));

// Continue sorting elements greater than the pivot. We know that `mid` contains
// the pivot. So we can continue after `mid`.
let mid = num_lt + 1;

// If we've passed our index, then we're good.
if mid > index {
return;
}

// Otherwise, continue sorting elements greater than the pivot.
v = &mut v[mid..];
index = index - mid;
pred = None;
ancestor_pivot = None;
continue;
}
}

let (mid, _) = partition(v, pivot, is_less);
was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8;
let mid = partition(v, pivot_pos, is_less);

// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
@@ -91,7 +121,7 @@ fn partition_at_index_loop<'a, T, F>(
if mid < index {
v = right;
index = index - mid - 1;
pred = Some(pivot);
ancestor_pivot = Some(pivot);
} else if mid > index {
v = left;
} else {
@@ -122,41 +152,6 @@ fn max_index<T, F: FnMut(&T, &T) -> bool>(slice: &[T], is_less: &mut F) -> Optio
.map(|(i, _)| i)
}

/// Reorder the slice such that the element at `index` is at its final sorted position.
pub fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
if index >= v.len() {
panic!("partition_at_index index {} greater than length of slice {}", index, v.len());
}

if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == v.len() - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, &mut is_less, None);
}

let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}

/// Selection algorithm to select the k-th element from the slice in guaranteed O(n) time.
/// This is essentially a quickselect that uses Tukey's Ninther for pivot selection
fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) {
@@ -168,8 +163,8 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut

// We now know that `k < v.len() <= isize::MAX`
loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@@ -232,7 +227,8 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
}

median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
partition(v, lo + pivot, is_less).0

partition(v, lo + pivot, is_less)
}

/// Moves around the 9 elements at the indices a..i, such that

@@ -0,0 +1,45 @@
use crate::marker::Freeze;

pub(crate) mod pivot;
pub(crate) mod smallsort;

/// SAFETY: this is safety relevant, how does this interact with the soundness holes in
/// specialization?
#[rustc_unsafe_specialization_marker]
pub(crate) trait FreezeMarker {}

impl<T: Freeze> FreezeMarker for T {}

/// Finds a run of sorted elements starting at the beginning of the slice.
///
/// Returns the length of the run, and a bool that is false when the run
/// is ascending, and true if the run is strictly descending.
#[inline(always)]
pub(crate) fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(
v: &[T],
is_less: &mut F,
) -> (usize, bool) {
let len = v.len();
if len < 2 {
return (len, false);
}

// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
// This also means that run_len < len implies run_len and run_len - 1
// are valid indices as well.
unsafe {
let mut run_len = 2;
let strictly_descending = is_less(v.get_unchecked(1), v.get_unchecked(0));
if strictly_descending {
while run_len < len && is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) {
run_len += 1;
}
} else {
while run_len < len && !is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1))
{
run_len += 1;
}
}
(run_len, strictly_descending)
}
}

@@ -0,0 +1,88 @@
//! This module contains the logic for pivot selection.

use crate::intrinsics;

// Recursively select a pseudomedian if above this threshold.
const PSEUDO_MEDIAN_REC_THRESHOLD: usize = 64;

/// Selects a pivot from `v`. Algorithm taken from glidesort by Orson Peters.
///
/// This chooses a pivot by sampling an adaptive number of points, approximating
/// the quality of a median of sqrt(n) elements.
pub fn choose_pivot<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> usize {
// We use unsafe code and raw pointers here because we're dealing with
// heavy recursion. Passing safe slices around would involve a lot of
// branches and function call overhead.

let len = v.len();
if len < 8 {
intrinsics::abort();
}

// SAFETY: a, b, c point to initialized regions of len_div_8 elements,
// satisfying median3 and median3_rec's preconditions as v_base points
// to an initialized region of n = len elements.
unsafe {
let v_base = v.as_ptr();
let len_div_8 = len / 8;

let a = v_base; // [0, floor(n/8))
let b = v_base.add(len_div_8 * 4); // [4*floor(n/8), 5*floor(n/8))
let c = v_base.add(len_div_8 * 7); // [7*floor(n/8), 8*floor(n/8))

if len < PSEUDO_MEDIAN_REC_THRESHOLD {
median3(&*a, &*b, &*c, is_less).sub_ptr(v_base)
} else {
median3_rec(a, b, c, len_div_8, is_less).sub_ptr(v_base)
}
}
}

/// Calculates an approximate median of 3 elements from sections a, b, c, or
/// recursively from an approximation of each, if they're large enough. By
/// dividing the size of each section by 8 when recursing we have logarithmic
/// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
/// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
///
/// SAFETY: a, b, c must point to the start of initialized regions of memory of
/// at least n elements.
unsafe fn median3_rec<T, F: FnMut(&T, &T) -> bool>(
mut a: *const T,
mut b: *const T,
mut c: *const T,
n: usize,
is_less: &mut F,
) -> *const T {
// SAFETY: a, b, c still point to initialized regions of n / 8 elements,
// by the exact same logic as in choose_pivot.
unsafe {
if n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD {
let n8 = n / 8;
a = median3_rec(a, a.add(n8 * 4), a.add(n8 * 7), n8, is_less);
b = median3_rec(b, b.add(n8 * 4), b.add(n8 * 7), n8, is_less);
c = median3_rec(c, c.add(n8 * 4), c.add(n8 * 7), n8, is_less);
}
median3(&*a, &*b, &*c, is_less)
}
}

/// Calculates the median of 3 elements.
///
/// SAFETY: a, b, c must be valid initialized elements.
#[inline(always)]
fn median3<T, F: FnMut(&T, &T) -> bool>(a: &T, b: &T, c: &T, is_less: &mut F) -> *const T {
// Compiler tends to make this branchless when sensible, and avoids the
// third comparison when not.
let x = is_less(a, b);
let y = is_less(a, c);
if x == y {
// If x=y=0 then b, c <= a. In this case we want to return max(b, c).
// If x=y=1 then a < b, c. In this case we want to return min(b, c).
// By toggling the outcome of b < c using XOR x we get this behavior.
let z = is_less(b, c);
if z ^ x { c } else { b }
} else {
// Either c <= a < b or b <= a < c, thus a is our median.
a
}
}

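// Illustrative example (editor's addition, not part of this change): the
// comparison scheme above on concrete values. Calling the module-private
// `median3` and dereferencing its raw-pointer result are assumptions for
// illustration only.
#[test]
fn median3_examples() {
    let mut lt = |a: &i32, b: &i32| a < b;
    // x = (2 < 1) = false, y = (2 < 3) = true: x != y, so `a` is the median
    // and the third comparison is skipped.
    assert_eq!(unsafe { *median3(&2, &1, &3, &mut lt) }, 2);
    // x = y = true means a < b and a < c, so the median is min(b, c).
    assert_eq!(unsafe { *median3(&1, &3, &2, &mut lt) }, 2);
}
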
@@ -0,0 +1,843 @@
//! This module contains a variety of sort implementations that are optimized for small lengths.

use crate::intrinsics;
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice;

use crate::slice::sort::shared::FreezeMarker;

// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}

impl<T> StableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Optimal number of comparisons, and good perf.
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        _scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        if v.len() >= 2 {
            insertion_sort_shift_left(v, 1, is_less);
        }
    }
}

impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        SMALL_SORT_GENERAL_THRESHOLD
    }

    #[inline(always)]
    fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        small_sort_general_with_scratch(v, scratch, is_less);
    }
}

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        small_sort_fallback(v, is_less);
    }
}

impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        match const { choose_unstable_small_sort::<T>() } {
            UnstableSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD,
            UnstableSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD,
            UnstableSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD,
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // This construct is used to limit the LLVM IR generated, which saves large amounts of
        // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
        (const { inst_unstable_small_sort::<T, F>() })(v, is_less);
    }
}

/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`]!
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`]!
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// SAFETY: If you change this value, you have to adjust [`small_sort_network`]!
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;

enum UnstableSmallSort {
    Fallback,
    General,
    Network,
}

const fn choose_unstable_small_sort<T: FreezeMarker>() -> UnstableSmallSort {
    if T::is_copy()
        && has_efficient_in_place_swap::<T>()
        && (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
    {
        // Heuristic for int like types.
        return UnstableSmallSort::Network;
    }

    if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
        return UnstableSmallSort::General;
    }

    UnstableSmallSort::Fallback
}

const fn inst_unstable_small_sort<T: FreezeMarker, F: FnMut(&T, &T) -> bool>()
-> fn(&mut [T], &mut F) {
    match const { choose_unstable_small_sort::<T>() } {
        UnstableSmallSort::Fallback => small_sort_fallback::<T, F>,
        UnstableSmallSort::General => small_sort_general::<T, F>,
        UnstableSmallSort::Network => small_sort_network::<T, F>,
    }
}
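
// An illustrative check (a sketch, not exhaustive) of how the const dispatch
// above plays out: `i32` is `Copy`, swaps efficiently in-place and fits the
// stack limit, so it gets the network small-sort; `String` is `Freeze` but not
// `Copy`, so it gets the general small-sort; a large `Copy` type like
// `[u8; 128]` exceeds `MAX_STACK_ARRAY_SIZE` and falls back.
#[test]
fn unstable_small_sort_dispatch() {
    assert!(
        <i32 as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_NETWORK_THRESHOLD
    );
    assert!(
        <String as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_GENERAL_THRESHOLD
    );
    assert!(
        <[u8; 128] as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_FALLBACK_THRESHOLD
    );
}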

fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    if v.len() >= 2 {
        insertion_sort_shift_left(v, 1, is_less);
    }
}

fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The slice is backed by `stack_array`, which provides storage and
    // alignment for exactly `SMALL_SORT_GENERAL_SCRATCH_LEN` elements of `T`.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}

fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        let presorted_len = if const { mem::size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            &*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}

struct CopyOnDrop<T> {
    src: *const T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}

fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling, we *really* don't want that to happen here for binary-size
    // reasons.
    loop {
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        region =
            unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should is_less panic, v was not modified in parity_merge and retains its original input.
    // scratch and v must not alias and scratch has v.len() space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(
            &mut *ptr::slice_from_raw_parts_mut(v_base, len),
            scratch_base,
            is_less,
        );
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}

/// Swaps the values in the slice pointed to by `v_base` at positions `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield
    // valid pointers into `v_base` that are properly aligned and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should
        // still be in a well defined state, without duplicates.

        // Important to only swap if it is more and not if it is equal. is_less should return
        // false for equal, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of swap if.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b);
        // }

        // The goal is to generate cmov instructions here.
        let left_swap = if should_swap { v_b } else { v_a };
        let right_swap = if should_swap { v_a } else { v_b };

        let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
        ptr::copy(left_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
    }
}

// Never inline this function to avoid code bloat. It still optimizes nicely and has practically
// no performance impact.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network, see:
    // https://bertdobbelaere.github.io/sorting_networks.html

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
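
// A small illustrative smoke test (one permutation, not a proof of network
// correctness): the 9-element network must sort a fully reversed input.
#[test]
fn sort9_optimal_sorts_reversed() {
    let mut v = [9, 8, 7, 6, 5, 4, 3, 2, 1];
    sort9_optimal(&mut v, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5, 6, 7, 8, 9]);
}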

// Never inline this function to avoid code bloat. It still optimizes nicely and has practically
// no performance impact.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network, see:
    // https://bertdobbelaere.github.io/sorting_networks.html

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
    }
}

/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers, as when we use a
        // for loop LLVM likes to unroll this loop which we do not want.
        //
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
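
// An illustrative example of the shift-left contract: `v[..offset]` must
// already be sorted, the remaining elements are inserted one by one.
#[test]
fn insertion_sort_shift_left_basic() {
    let mut v = [1, 3, 5, 2, 4];
    insertion_sort_shift_left(&mut v, 3, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5]);
}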

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further this only does 5 instead of 6
    // comparisons compared to a stable transposition 4 element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d      b             c
        //  0,  1 |  a   b      c             d
        //  1,  0 |  c   d      a             b
        //  1,  1 |  c   b      a             d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = select(c3, c, a);
        let max = select(c4, b, d);
        let unknown_left = select(c3, a, select(c4, c, b));
        let unknown_right = select(c4, d, select(c3, b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = select(c5, unknown_right, unknown_left);
        let hi = select(c5, unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }

    #[inline(always)]
    fn select<T>(cond: bool, if_true: *const T, if_false: *const T) -> *const T {
        if cond { if_true } else { if_false }
    }
}
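
// An illustrative stability check: sorting pairs by their first field only
// must preserve the input order of the second field among equal keys.
#[test]
fn sort4_stable_preserves_equal_order() {
    let v = [(1, 'a'), (0, 'b'), (1, 'c'), (0, 'd')];
    let mut dst = MaybeUninit::<[(u32, char); 4]>::uninit();
    // SAFETY: `v` is valid for 4 reads and `dst` for 4 writes, and
    // `sort4_stable` initializes all of `dst[0..4]`.
    let out = unsafe {
        sort4_stable(v.as_ptr(), dst.as_mut_ptr() as *mut (u32, char), &mut |a, b| a.0 < b.0);
        dst.assume_init()
    };
    assert!(out == [(0, 'b'), (0, 'd'), (1, 'a'), (1, 'c')]);
}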

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
    }
}

#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}

#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}

/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze; the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^            ^       ^
    //              |left_rev           |       |right_rev
    //                                  |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented,
    // at the end we will have left == left_rev + 1, and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if len % 2 != 0 {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We now should have consumed the full input exactly once. This can
        // only fail if the comparison operator fails to be Ord, in which case
        // we will panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}
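
// An illustrative check: merging the two sorted halves v[..len / 2] and
// v[len / 2..] into a non-aliasing destination buffer.
#[test]
fn bidirectional_merge_basic() {
    let v = [1, 3, 5, 7, 2, 4, 6, 8];
    let mut dst = MaybeUninit::<[i32; 8]>::uninit();
    // SAFETY: v.len() >= 2, `dst` is valid for 8 writes and does not alias `v`,
    // and `bidirectional_merge` initializes all of `dst[0..8]`.
    let merged = unsafe {
        bidirectional_merge(&v, dst.as_mut_ptr() as *mut i32, &mut |a, b| a < b);
        dst.assume_init()
    };
    assert!(merged == [1, 2, 3, 4, 5, 6, 7, 8]);
}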

#[inline(never)]
fn panic_on_ord_violation() -> ! {
    panic!("Ord violation");
}

#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // Heuristic that holds true on all tested 64-bit capable architectures.
    mem::size_of::<T>() <= 8 // mem::size_of::<u64>()
}

#[test]
fn type_info() {
    assert!(has_efficient_in_place_swap::<i32>());
    assert!(has_efficient_in_place_swap::<u64>());
    assert!(!has_efficient_in_place_swap::<u128>());
    assert!(!has_efficient_in_place_swap::<String>());
}

/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

impl<T: Copy> CopyMarker for T {}

#[const_trait]
trait IsCopy {
    fn is_copy() -> bool;
}

impl<T> const IsCopy for T {
    default fn is_copy() -> bool {
        false
    }
}

impl<T: CopyMarker> const IsCopy for T {
    fn is_copy() -> bool {
        true
    }
}
@@ -0,0 +1,300 @@
//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::cmp;
use crate::intrinsics;
use crate::mem::MaybeUninit;

use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least `max(v.len() / 2,
/// MIN_SMALL_SORT_SCRATCH_LEN)` otherwise the implementation may abort.
/// Fully ascending and descending inputs will be sorted with exactly N - 1
/// comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input length `MIN_SQRT_RUN_LEN` would break pattern
        // detection of full or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        // 1. The first stack_len elements of runs/desired_depths are initialized.
        // 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        // 3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //    scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than the upcoming desired depth, pop
                // left neighbor run from stack and merge into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}
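
// An illustrative usage sketch: `scratch` must hold at least
// max(v.len() / 2, the small-sort scratch minimum) elements; the literal 48
// here mirrors SMALL_SORT_GENERAL_SCRATCH_LEN as defined in smallsort.
#[test]
fn drift_sort_smoke() {
    let mut v = [3, 1, 2, 5, 4, 0];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 48];
    sort(&mut v, &mut scratch, false, &mut |a, b| a < b);
    assert!(v == [0, 1, 2, 3, 4, 5]);
}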

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//     x < (2^62 / n + 1) * 2n
//     x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
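
// An illustrative worked example of the heuristic above: for n = 8 with four
// runs [0,2), [2,4), [4,6), [6,8), the middle boundary corresponds to the
// simplest dyadic fraction (1/2) and must be the shallowest merge node.
#[test]
fn merge_tree_depth_prefers_middle() {
    let f = merge_tree_scale_factor(8);
    let d_left = merge_tree_depth(0, 2, 4, f);
    let d_mid = merge_tree_depth(2, 4, 6, f);
    let d_right = merge_tree_depth(4, 6, 8, f);
    assert!(d_mid < d_left && d_mid < d_right);
}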

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}
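
// An illustrative check of the approximation quality: exact for powers of
// four, and within a few percent elsewhere (the true sqrt(10_000) is 100;
// the approximation yields (128 + 78) / 2 = 103).
#[test]
fn sqrt_approx_is_close() {
    assert!(sqrt_approx(64) == 8);
    assert!(sqrt_approx(10_000) == 103);
}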

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined into
        // the quicksort itself, making the recursive base case faster and generally
        // more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a usize because the maximum slice length is isize::MAX.
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
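
// An illustrative round-trip check of the bit-packing above: the sorted flag
// lives in the lowest bit, the length in the remaining bits.
#[test]
fn driftsort_run_roundtrip() {
    let sorted = DriftsortRun::new_sorted(1234);
    assert!(sorted.sorted() && sorted.len() == 1234);
    let unsorted = DriftsortRun::new_unsorted(1234);
    assert!(!unsorted.sorted() && unsorted.len() == 1234);
}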

@@ -0,0 +1,151 @@
//! This module contains logic for performing a merge of two sorted sub-slices.

use crate::cmp;
use crate::mem::MaybeUninit;
use crate::ptr;

/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `scratch` as
/// temporary storage, and stores the result into `v[..]`.
pub fn merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    mid: usize,
    is_less: &mut F,
) {
    let len = v.len();

    if mid == 0 || mid >= len || scratch.len() < cmp::min(mid, len - mid) {
        return;
    }

    // SAFETY: We checked that the two slices are non-empty and `mid` is in-bounds.
    // We checked that the buffer `scratch` has enough capacity to hold a copy of
    // the shorter slice. `merge_up` and `merge_down` are written in such a way that
    // they uphold the contract described in `MergeState::drop`.
    unsafe {
        // The merge process first copies the shorter run into `buf`. Then it traces
        // the newly copied run and the longer run forwards (or backwards), comparing
        // their next unconsumed elements and copying the lesser (or greater) one into `v`.
        //
        // As soon as the shorter run is fully consumed, the process is done. If the
        // longer run gets consumed first, then we must copy whatever is left of the
        // shorter run into the remaining gap in `v`.
        //
        // Intermediate state of the process is always tracked by `gap`, which serves
        // two purposes:
        // 1. Protects integrity of `v` from panics in `is_less`.
        // 2. Fills the remaining gap in `v` if the longer run gets consumed first.

        let buf = MaybeUninit::slice_as_mut_ptr(scratch);

        let v_base = v.as_mut_ptr();
        let v_mid = v_base.add(mid);
        let v_end = v_base.add(len);

        let left_len = mid;
        let right_len = len - mid;

        let left_is_shorter = left_len <= right_len;
        let save_base = if left_is_shorter { v_base } else { v_mid };
        let save_len = if left_is_shorter { left_len } else { right_len };

        ptr::copy_nonoverlapping(save_base, buf, save_len);

        let mut merge_state = MergeState { start: buf, end: buf.add(save_len), dst: save_base };

        if left_is_shorter {
            merge_state.merge_up(v_mid, v_end, is_less);
        } else {
            merge_state.merge_down(v_base, buf, v_end, is_less);
        }
        // Finally, `merge_state` gets dropped. If the shorter run was not fully
        // consumed, whatever remains of it will now be copied into the hole in `v`.
    }

    // When dropped, copies the range `start..end` into `dst..`.
    struct MergeState<T> {
        start: *mut T,
        end: *mut T,
        dst: *mut T,
    }

    impl<T> MergeState<T> {
        /// # Safety
        /// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is
        /// the longer sub-slice and so that `dst` can be written to at least the shorter
        /// sub-slice length times. In addition `start -> end` and `right -> right_end` MUST be
        /// valid to be read. This function MUST only be called once.
        unsafe fn merge_up<F: FnMut(&T, &T) -> bool>(
            &mut self,
            mut right: *const T,
            right_end: *const T,
            is_less: &mut F,
        ) {
            // SAFETY: See function safety comment.
            unsafe {
                let left = &mut self.start;
                let out = &mut self.dst;

                while *left != self.end && right as *const T != right_end {
                    let consume_left = !is_less(&*right, &**left);

                    let src = if consume_left { *left } else { right };
                    ptr::copy_nonoverlapping(src, *out, 1);

                    *left = left.add(consume_left as usize);
                    right = right.add(!consume_left as usize);

                    *out = out.add(1);
                }
            }
        }

        /// # Safety
        /// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst`
        /// is the shorter sub-slice and so that `out` can be written to at least the shorter
        /// sub-slice length times. In addition `left_end <- dst` and `right_end <- end` MUST be
        /// valid to be read. This function MUST only be called once.
        unsafe fn merge_down<F: FnMut(&T, &T) -> bool>(
            &mut self,
            left_end: *const T,
            right_end: *const T,
            mut out: *mut T,
            is_less: &mut F,
        ) {
            // SAFETY: See function safety comment.
            unsafe {
                loop {
                    let left = self.dst.sub(1);
                    let right = self.end.sub(1);
                    out = out.sub(1);

                    let consume_left = is_less(&*right, &*left);

                    let src = if consume_left { left } else { right };
                    ptr::copy_nonoverlapping(src, out, 1);

                    self.dst = left.add(!consume_left as usize);
                    self.end = right.add(consume_left as usize);

                    if self.dst as *const T == left_end || self.end as *const T == right_end {
                        break;
                    }
                }
            }
        }
    }

    impl<T> Drop for MergeState<T> {
        fn drop(&mut self) {
            // SAFETY: The user of MergeState MUST ensure, that at any point this drop
            // impl MAY run, for example when the user provided `is_less` panics, that
            // copying the contiguous region between `start` and `end` to `dst` will
            // leave the input slice `v` with each original element and all possible
            // modifications observed.
            unsafe {
                let len = self.end.sub_ptr(self.start);
                ptr::copy_nonoverlapping(self.start, self.dst, len);
            }
        }
    }
}
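
// An illustrative check: merging the sorted runs v[..3] and v[3..], with
// scratch sized for the shorter run (min(mid, len - mid) elements suffice).
#[test]
fn merge_basic() {
    let mut v = [2, 4, 6, 1, 3, 5, 7];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 3];
    merge(&mut v, &mut scratch, 3, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5, 6, 7]);
}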

@@ -0,0 +1,116 @@
//! This module contains the entry points for `slice::sort`.

use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};

use crate::slice::sort::shared::smallsort::{
    insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};

pub(crate) mod drift;
pub(crate) mod merge;
pub(crate) mod quicksort;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }

    driftsort_main::<T, F, BufT>(v, is_less);
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n / 2 for large inputs. We do this by scaling
    // our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
    let len = v.len();
    let alloc_len =
        cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
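
    // Worked example of this formula, assuming T = u64 (8 bytes), so
    // max_full_alloc = 1_000_000 elements:
    //   len = 10_000_000: max(5_000_000, min(10_000_000, 1_000_000)) = 5_000_000 (n / 2)
    //   len =    500_000: max(250_000,   min(500_000,    1_000_000)) =   500_000 (n)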

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts an owned memory buffer, so that sort code can live in core where no allocation is
/// possible. This trait can then be implemented in a place that has access to allocation.
pub trait BufGuard<T> {
    /// Creates a new buffer that holds at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
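
// A minimal sketch of how an allocating crate might implement `BufGuard`,
// assuming `Vec` is available (illustrative only, not necessarily the
// implementation the standard library actually uses):
//
//     struct VecBuf<T>(Vec<MaybeUninit<T>>);
//
//     impl<T> BufGuard<T> for VecBuf<T> {
//         fn with_capacity(capacity: usize) -> Self {
//             let mut buf = Vec::with_capacity(capacity);
//             // SAFETY: `MaybeUninit<T>` is allowed to be uninitialized.
//             unsafe { buf.set_len(capacity) };
//             VecBuf(buf)
//         }
//
//         fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
//             &mut self.0
//         }
//     }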

#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: MaybeUninit::uninit_array() }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        let len = N / mem::size_of::<T>();

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}
@@ -0,0 +1,267 @@
|
||||
//! This module contains a stable quicksort and partition implementation.
|
||||
|
||||
use crate::intrinsics;
|
||||
use crate::mem::{self, ManuallyDrop, MaybeUninit};
|
||||
use crate::ptr;
|
||||
|
||||
use crate::slice::sort::shared::pivot::choose_pivot;
|
||||
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
|
||||
use crate::slice::sort::shared::FreezeMarker;
|
||||
|
||||
/// Sorts `v` recursively using quicksort.
|
||||
///
|
||||
/// `limit` when initialized with `c*log(v.len())` for some c ensures we do not
|
||||
/// overflow the stack or go quadratic.
|
||||
#[inline(never)]
|
||||
pub fn quicksort<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
mut limit: u32,
|
||||
mut left_ancestor_pivot: Option<&T>,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
loop {
|
||||
let len = v.len();
|
||||
|
||||
if len <= T::small_sort_threshold() {
|
||||
T::small_sort(v, scratch, is_less);
|
||||
return;
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
// We have had too many bad pivots, switch to O(n log n) fallback
|
||||
// algorithm. In our case that is driftsort in eager mode.
|
||||
crate::slice::sort::stable::drift::sort(v, scratch, true, is_less);
|
||||
return;
|
||||
}
|
||||
limit -= 1;
|
||||
|
||||
let pivot_pos = choose_pivot(v, is_less);
|
||||
// SAFETY: choose_pivot promises to return a valid pivot index.
|
||||
unsafe {
|
||||
intrinsics::assume(pivot_pos < v.len());
|
||||
}
|
||||
|
||||
// SAFETY: We only access the temporary copy for Freeze types, otherwise
|
||||
// self-modifications via `is_less` would not be observed and this would
|
||||
// be unsound. Our temporary copy does not escape this scope.
|
||||
let pivot_copy = unsafe { ManuallyDrop::new(ptr::read(&v[pivot_pos])) };
|
||||
let pivot_ref = (!has_direct_interior_mutability::<T>()).then_some(&*pivot_copy);
|
||||
|
||||
// We choose a pivot, and check if this pivot is equal to our left
|
||||
// ancestor. If true, we do a partition putting equal elements on the
|
||||
// left and do not recurse on it. This gives O(n log k) sorting for k
|
||||
// distinct values, a strategy borrowed from pdqsort. For types with
|
||||
// interior mutability we can't soundly create a temporary copy of the
|
||||
// ancestor pivot, and use left_partition_len == 0 as our method for
|
||||
// detecting when we re-use a pivot, which means we do at most three
|
||||
// partition operations with pivot p instead of the optimal two.
|
||||
let mut perform_equal_partition = false;
|
||||
if let Some(la_pivot) = left_ancestor_pivot {
|
||||
perform_equal_partition = !is_less(la_pivot, &v[pivot_pos]);
|
||||
}
|
||||
|
||||
let mut left_partition_len = 0;
|
||||
if !perform_equal_partition {
|
||||
left_partition_len = stable_partition(v, scratch, pivot_pos, false, is_less);
|
||||
perform_equal_partition = left_partition_len == 0;
|
||||
}
|
||||
|
||||
if perform_equal_partition {
|
||||
let mid_eq = stable_partition(v, scratch, pivot_pos, true, &mut |a, b| !is_less(b, a));
|
||||
v = &mut v[mid_eq..];
|
||||
left_ancestor_pivot = None;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process left side with the next loop iter, right side with recursion.
|
||||
let (left, right) = v.split_at_mut(left_partition_len);
|
||||
quicksort(right, scratch, limit, pivot_ref, is_less);
|
||||
v = left;
|
||||
}
|
||||
}
|
||||
|
||||
/// Partitions `v` using pivot `p = v[pivot_pos]` and returns the number of
/// elements less than `p`. The relative order of elements that compare < p and
/// those that compare >= p is preserved - it is a stable partition.
///
/// If `is_less` is not a strict total order or panics, or if `scratch.len() < v.len()`
/// or `pivot_pos >= v.len()`, the result and the state of `v` are sound but unspecified.
fn stable_partition<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    pivot_pos: usize,
    pivot_goes_left: bool,
    is_less: &mut F,
) -> usize {
    let len = v.len();

    if intrinsics::unlikely(scratch.len() < len || pivot_pos >= len) {
        core::intrinsics::abort()
    }

    let v_base = v.as_ptr();
    let scratch_base = MaybeUninit::slice_as_mut_ptr(scratch);

    // The core idea is to write the values that compare as less-than to the left
    // side of `scratch`, while the values that compared as greater or equal than
    // `v[pivot_pos]` go to the right side of `scratch` in reverse. See
    // PartitionState for details.

    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: we made sure the scratch has length >= len and that pivot_pos
        // is in-bounds. v and scratch are disjoint slices.
        let pivot = v_base.add(pivot_pos);
        let mut state = PartitionState::new(v_base, scratch_base, len);

        let mut pivot_in_scratch = ptr::null_mut();
        let mut loop_end_pos = pivot_pos;

        // SAFETY: this loop is equivalent to calling state.partition_one
        // exactly len times.
        loop {
            // Ideally the outer loop won't be unrolled, to save binary size,
            // but we do want the inner loop to be unrolled for small types, as
            // this gave significant performance boosts in benchmarks. Unrolling
            // through for _ in 0..UNROLL_LEN { .. } instead of manually improves
            // compile times but has a ~10-20% performance penalty on opt-level=s.
            if const { mem::size_of::<T>() <= 16 } {
                const UNROLL_LEN: usize = 4;
                let unroll_end = v_base.add(loop_end_pos.saturating_sub(UNROLL_LEN - 1));
                while state.scan < unroll_end {
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                }
            }

            let loop_end = v_base.add(loop_end_pos);
            while state.scan < loop_end {
                state.partition_one(is_less(&*state.scan, &*pivot));
            }

            if loop_end_pos == len {
                break;
            }

            // We avoid comparing pivot with itself, as this could create deadlocks for
            // certain comparison operators. We also store its location for later.
            pivot_in_scratch = state.partition_one(pivot_goes_left);

            loop_end_pos = len;
        }

        // `pivot` must be copied into its correct position again, because a
        // comparison operator might have modified it.
        if has_direct_interior_mutability::<T>() {
            ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
        }

        // SAFETY: partition_one being called exactly len times guarantees that scratch
        // is initialized with a permuted copy of `v`, and that num_left <= v.len().
        // Copying scratch[0..num_left] and scratch[num_left..v.len()] back is thus
        // sound, as the values in scratch will never be read again, meaning our copies
        // semantically act as moves, permuting `v`.

        // Copy all the elements < p directly from scratch to v.
        let v_base = v.as_mut_ptr();
        ptr::copy_nonoverlapping(scratch_base, v_base, state.num_left);

        // Copy the elements >= p in reverse order.
        for i in 0..len - state.num_left {
            ptr::copy_nonoverlapping(
                scratch_base.add(len - 1 - i),
                v_base.add(state.num_left + i),
                1,
            );
        }

        state.num_left
    }
}
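
// A safe-Rust sketch of the scratch layout described above, assuming `Clone`
// elements instead of raw moves: `<`-elements grow from the front of the
// scratch, `>=`-elements grow from the back (and so end up reversed), and the
// back half is read in reverse when copied out, which is what keeps the
// partition stable. `stable_partition_sketch` is a hypothetical helper.
#[cfg(test)]
fn stable_partition_sketch<T: Clone, F: FnMut(&T) -> bool>(v: &[T], mut is_lt: F) -> (Vec<T>, usize) {
    let len = v.len();
    let mut scratch: Vec<Option<T>> = vec![None; len];
    let (mut lo, mut hi) = (0, len);
    for x in v {
        // One store per element, to one of two cursors; branchless in the real code.
        if is_lt(x) {
            scratch[lo] = Some(x.clone());
            lo += 1;
        } else {
            hi -= 1;
            scratch[hi] = Some(x.clone());
        }
    }
    let mut out: Vec<T> = scratch[..lo].iter().map(|s| s.clone().unwrap()).collect();
    // The `>=` side was written back-to-front; reading it in reverse restores
    // the original relative order.
    out.extend(scratch[lo..].iter().rev().map(|s| s.clone().unwrap()));
    (out, lo)
}

#[test]
fn stable_partition_sketch_works() {
    let (out, num_lt) = stable_partition_sketch(&[5, 1, 4, 2, 3], |x| *x < 3);
    assert_eq!(num_lt, 2);
    assert_eq!(out, vec![1, 2, 5, 4, 3]); // both sides keep their input order
}
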
struct PartitionState<T> {
    // The start of the scratch auxiliary memory.
    scratch_base: *mut T,
    // The current element that is being looked at, scans left to right through slice.
    scan: *const T,
    // Counts the number of elements that went to the left side, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_left: usize,
    // Reverse scratch output pointer.
    scratch_rev: *mut T,
}

impl<T> PartitionState<T> {
    /// # Safety
    /// scan and scratch must point to valid disjoint buffers of length len. The
    /// scan buffer must be initialized.
    unsafe fn new(scan: *const T, scratch: *mut T, len: usize) -> Self {
        // SAFETY: See function safety comment.
        unsafe { Self { scratch_base: scratch, scan, num_left: 0, scratch_rev: scratch.add(len) } }
    }

    /// Depending on the value of `towards_left` this function will write a value
    /// to the growing left or right side of the scratch memory. This forms the
    /// branchless core of the partition.
    ///
    /// # Safety
    /// This function may be called at most `len` times. If it is called exactly
    /// `len` times the scratch buffer then contains a copy of each element from
    /// the scan buffer exactly once - a permutation, and num_left <= len.
    unsafe fn partition_one(&mut self, towards_left: bool) -> *mut T {
        // SAFETY: see individual comments.
        unsafe {
            // SAFETY: in-bounds because this function is called at most len times, and thus
            // right now is incremented at most len - 1 times. Similarly, num_left < len and
            // num_right < len, where num_right == i - num_left at the start of the ith
            // iteration (zero-indexed).
            self.scratch_rev = self.scratch_rev.sub(1);

            // SAFETY: now we have scratch_rev == base + len - (i + 1). This means
            // scratch_rev + num_left == base + len - 1 - num_right < base + len.
            let dst_base = if towards_left { self.scratch_base } else { self.scratch_rev };
            let dst = dst_base.add(self.num_left);
            ptr::copy_nonoverlapping(self.scan, dst, 1);

            self.num_left += towards_left as usize;
            self.scan = self.scan.add(1);
            dst
        }
    }
}

#[const_trait]
trait IsFreeze {
    fn is_freeze() -> bool;
}

impl<T> const IsFreeze for T {
    default fn is_freeze() -> bool {
        false
    }
}
impl<T: FreezeMarker> const IsFreeze for T {
    fn is_freeze() -> bool {
        true
    }
}

#[must_use]
const fn has_direct_interior_mutability<T>() -> bool {
    // If a type has interior mutability it may alter itself during comparison
    // in a way that must be preserved after the sort operation concludes.
    // Otherwise a type like Mutex<Option<Box<str>>> could lead to double free.
    !T::is_freeze()
}

#[test]
fn freeze_check() {
    assert!(!has_direct_interior_mutability::<u32>());
    assert!(!has_direct_interior_mutability::<[u128; 2]>());

    assert!(has_direct_interior_mutability::<crate::cell::Cell<u32>>());
    assert!(has_direct_interior_mutability::<crate::sync::Mutex<u32>>());
}
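
// A sketch of the hazard `has_direct_interior_mutability` guards against,
// assuming a `Cell`-based element type: a comparison function may legally
// mutate elements through `&T`, so any temporary copy the sort holds (like
// `pivot_copy` above) could go stale and must not be written back blindly.
// Illustrative test, not part of the implementation.
#[test]
fn interior_mutability_hazard_sketch() {
    use crate::cell::Cell;

    let v = [Cell::new(2), Cell::new(1)];
    let is_less = |a: &Cell<i32>, b: &Cell<i32>| {
        // A pathological but legal comparator that writes through `&T`.
        a.set(a.get() + 10);
        a.get() < b.get()
    };

    // After a single comparison the element has changed; a stale copy taken
    // before the call would no longer reflect the slice's true contents.
    let _ = is_less(&v[0], &v[1]);
    assert_eq!(v[0].get(), 12);
}
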
@@ -0,0 +1,80 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.

use crate::intrinsics;
use crate::ptr;

/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, it sits in the main hot-loop in `recurse` and is meant as an unlikely
/// algorithmic fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(v.len() >= 2);

        // Build the heap in linear time.
        for i in (0..v.len() / 2).rev() {
            sift_down(v, i, is_less);
        }

        // Pop maximal elements from the heap.
        for i in (1..v.len()).rev() {
            v.swap(0, i);
            sift_down(&mut v[..i], 0, is_less);
        }
    }
}

// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(node < v.len());
    }

    let len = v.len();

    let v_base = v.as_mut_ptr();

    loop {
        // Children of `node`.
        let mut child = 2 * node + 1;
        if child >= len {
            break;
        }

        // SAFETY: The invariants and checks guarantee that both node and child are in-bounds.
        unsafe {
            // Choose the greater child.
            if child + 1 < len {
                // We need a branch to be sure not to out-of-bounds index,
                // but it's highly predictable. The comparison, however,
                // is better done branchless, especially for primitives.
                child += is_less(&*v_base.add(child), &*v_base.add(child + 1)) as usize;
            }

            // Stop if the invariant holds at `node`.
            if !is_less(&*v_base.add(node), &*v_base.add(child)) {
                break;
            }

            // Swap `node` with the greater child, move one step down, and continue sifting. This
            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
            ptr::swap(v_base.add(node), v_base.add(child));
        }

        node = child;
    }
}
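
// A safe sketch of the same heap layout, assuming `Ord` elements instead of an
// `is_less` closure: node `i` has children `2*i + 1` and `2*i + 2`, the heap is
// built bottom-up from the last parent, and the maximum is repeatedly swapped
// to the shrinking tail. `heapsort_sketch` is a hypothetical helper.
#[cfg(test)]
fn heapsort_sketch<T: Ord>(v: &mut [T]) {
    fn sift<T: Ord>(v: &mut [T], mut node: usize) {
        loop {
            let mut child = 2 * node + 1;
            if child >= v.len() {
                break;
            }
            // Pick the greater child; the bump mirrors the branchless `+=` above.
            if child + 1 < v.len() && v[child] < v[child + 1] {
                child += 1;
            }
            if v[node] >= v[child] {
                break;
            }
            v.swap(node, child);
            node = child;
        }
    }

    for i in (0..v.len() / 2).rev() {
        sift(v, i);
    }
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift(&mut v[..i], 0);
    }
}

#[test]
fn heapsort_sketch_works() {
    let mut v = [5, 1, 4, 2, 3];
    heapsort_sketch(&mut v);
    assert_eq!(v, [1, 2, 3, 4, 5]);
}
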
@@ -0,0 +1,76 @@
//! This module contains the entry points for `slice::sort_unstable`.

use crate::intrinsics;
use crate::mem::SizedTypeProperties;

use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
pub(crate) mod quicksort;

/// Unstable sort called ipnsort by Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }

    ipnsort(v, is_less);
}

/// See [`sort`].
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();
    let (run_len, was_reversed) = find_existing_run(v, is_less);

    // SAFETY: find_existing_run promises to return a valid run_len.
    unsafe { intrinsics::assume(run_len <= len) };

    if run_len == len {
        if was_reversed {
            v.reverse();
        }

        // It would be possible to do in-place merging here for a long existing streak. But that
        // makes the implementation a lot bigger; users can use `slice::sort` for that use-case.
        return;
    }

    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (len | 1).ilog2();
    crate::slice::sort::unstable::quicksort::quicksort(v, None, limit, is_less);
}
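
// A worked example of the recursion limit above, assuming these input lengths:
// `len | 1` only forces the argument to be non-zero so `ilog2` cannot panic;
// for any `len >= 2` it never changes the result, because an even `len` plus
// one is odd and therefore never a power of two.
#[test]
fn recursion_limit_sketch() {
    assert_eq!(2 * (24u32 | 1).ilog2(), 8); // floor(log2(25)) == floor(log2(24)) == 4
    assert_eq!(2 * (1024u32 | 1).ilog2(), 20); // floor(log2(1025)) == floor(log2(1024)) == 10
}
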
@@ -0,0 +1,347 @@
//! This module contains an unstable quicksort and two partition implementations.

use crate::intrinsics;
use crate::mem::{self, ManuallyDrop};
use crate::ptr;

use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;

/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `ancestor_pivot`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
pub(crate) fn quicksort<'a, T, F>(
    mut v: &'a mut [T],
    mut ancestor_pivot: Option<&'a T>,
    mut limit: u32,
    is_less: &mut F,
) where
    F: FnMut(&T, &T) -> bool,
{
    loop {
        if v.len() <= T::small_sort_threshold() {
            T::small_sort(v, is_less);
            return;
        }

        // If too many bad pivot choices were made, simply fall back to heapsort in order to
        // guarantee `O(N x log(N))` worst-case.
        if limit == 0 {
            // SAFETY: We assume the `small_sort` threshold is at least 1.
            unsafe {
                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
            }
            return;
        }

        limit -= 1;

        // Choose a pivot and try guessing whether the slice is already sorted.
        let pivot_pos = choose_pivot(v, is_less);

        // If the chosen pivot is equal to the predecessor, then it's the smallest element in the
        // slice. Partition the slice into elements equal to and elements greater than the pivot.
        // This case is usually hit when the slice contains many duplicate elements.
        if let Some(p) = ancestor_pivot {
            // SAFETY: We assume choose_pivot yields an in-bounds position.
            if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
                let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));

                // Continue sorting elements greater than the pivot. We know that `num_lt` contains
                // the pivot. So we can continue after `num_lt`.
                v = &mut v[(num_lt + 1)..];
                ancestor_pivot = None;
                continue;
            }
        }

        // Partition the slice.
        let num_lt = partition(v, pivot_pos, is_less);
        // SAFETY: partition ensures that `num_lt` will be in-bounds.
        unsafe { intrinsics::assume(num_lt < v.len()) };

        // Split the slice into `left`, `pivot`, and `right`.
        let (left, right) = v.split_at_mut(num_lt);
        let (pivot, right) = right.split_at_mut(1);
        let pivot = &pivot[0];

        // Recurse into the left side. We have a fixed recursion limit, testing shows no real
        // benefit for recursing into the shorter side.
        quicksort(left, ancestor_pivot, limit, is_less);

        // Continue with the right side.
        v = right;
        ancestor_pivot = Some(pivot);
    }
}
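
// A small sketch of the flipped predicate used for the equal-elements
// partition above, assuming a strict weak ordering `is_less`: the closure
// `|a, b| !is_less(b, a)` asks "a <= b", so partitioning with it gathers
// everything equivalent to the pivot (including the pivot itself) on the left.
// Illustrative test, not part of the implementation.
#[test]
fn flipped_predicate_sketch() {
    let mut is_less = |a: &i32, b: &i32| a < b;
    let mut le = |a: &i32, b: &i32| !is_less(b, a);

    assert!(le(&3, &3)); // equivalent elements compare true, unlike with `is_less`
    assert!(le(&3, &4));
    assert!(!le(&4, &3));
}
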
/// Takes the input slice `v` and re-arranges elements such that when the call returns normally
/// all elements that compare true for `is_less(elem, pivot)` where `pivot == v[pivot_pos]` are
/// on the left side of `v` followed by the other elements, notionally considered greater or
/// equal to `pivot`.
///
/// Returns the number of elements that are compared true for `is_less(elem, pivot)`.
///
/// If `is_less` does not implement a total order the resulting order and return value are
/// unspecified. All original elements will remain in `v` and any possible modifications via
/// interior mutability will be observable. Same is true if `is_less` panics.
pub(crate) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Allows for panic-free code-gen by proving this property to the compiler.
    if len == 0 {
        return 0;
    }

    // Allows for panic-free code-gen by proving this property to the compiler.
    if pivot >= len {
        intrinsics::abort();
    }

    // Place the pivot at the beginning of slice.
    v.swap(0, pivot);
    let (pivot, v_without_pivot) = v.split_at_mut(1);

    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
    // into a stack value, but this creates issues for types with interior mutability mandating
    // a drop guard.
    let pivot = &mut pivot[0];

    // This construct is used to limit the LLVM IR generated, which saves large amounts of
    // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
    let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

    // Place the pivot between the two partitions.
    v.swap(0, num_lt);

    num_lt
}

const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut F) -> usize {
    const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
    if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
        // Specialize for types that are relatively cheap to copy, where branchless optimizations
        // have large leverage e.g. `u64` and `String`.
        partition_lomuto_branchless_cyclic::<T, F>
    } else {
        partition_hoare_branchy_cyclic::<T, F>
    }
}
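
// A sketch of the instantiation trick above, assuming a standalone helper: the
// branch is decided inside a `const` block, so for each concrete `T` only the
// selected strategy is referenced. In the real `inst_partition` this means the
// non-chosen partition function is never monomorphized at all.
// `dispatch_sketch` is a hypothetical helper.
#[cfg(test)]
fn dispatch_sketch<T>() -> &'static str {
    const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
    // Evaluated at compile time for each instantiation of `T`.
    if const { mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE } {
        "lomuto_branchless_cyclic"
    } else {
        "hoare_branchy_cyclic"
    }
}

#[test]
fn dispatch_sketch_works() {
    assert_eq!(dispatch_sketch::<u64>(), "lomuto_branchless_cyclic");
    assert_eq!(dispatch_sketch::<[u8; 128]>(), "hoare_branchy_cyclic");
}
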
/// See [`partition`].
fn partition_hoare_branchy_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    if len == 0 {
        return 0;
    }

    // Optimized for large types that are expensive to move. Not optimized for integers. Optimized
    // for small code-gen, assuming that is_less is an expensive operation that generates
    // substantial amounts of code or a call. And that copying elements will likely be a call to
    // memcpy. Using two `ptr::copy_nonoverlapping` calls has the chance to be faster than
    // `ptr::swap_nonoverlapping` because `memcpy` can use wide SIMD based on runtime feature
    // detection. Benchmarks support this analysis.

    let mut gap_opt: Option<GapGuard<T>> = None;

    // SAFETY: The left-to-right scanning loop performs a bounds check, where we know that `left >=
    // v_base && left < right && right <= v_base.add(len)`. The right-to-left scanning loop performs
    // a bounds check ensuring that `right` is in-bounds. We checked that `len` is more than zero,
    // which means that unconditional `right = right.sub(1)` is safe to do. The exit check makes
    // sure that `left` and `right` never alias, making `ptr::copy_nonoverlapping` safe. The
    // drop-guard `gap` ensures that should `is_less` panic we always overwrite the duplicate in the
    // input. `gap.pos` stores the previous value of `right` and starts at `right` and so it too is
    // in-bounds. We never pass the saved `gap.value` to `is_less` while it is inside the `GapGuard`
    // thus any changes via interior mutability will be observed.
    unsafe {
        let v_base = v.as_mut_ptr();

        let mut left = v_base;
        let mut right = v_base.add(len);

        loop {
            // Find the first element greater than the pivot.
            while left < right && is_less(&*left, pivot) {
                left = left.add(1);
            }

            // Find the last element less than the pivot.
            loop {
                right = right.sub(1);
                if left >= right || is_less(&*right, pivot) {
                    break;
                }
            }

            if left >= right {
                break;
            }

            // Swap the found pair of out-of-order elements via cyclic permutation.
            let is_first_swap_pair = gap_opt.is_none();

            if is_first_swap_pair {
                gap_opt = Some(GapGuard { pos: right, value: ManuallyDrop::new(ptr::read(left)) });
            }

            let gap = gap_opt.as_mut().unwrap_unchecked();

            // Single place where we instantiate ptr::copy_nonoverlapping in the partition.
            if !is_first_swap_pair {
                ptr::copy_nonoverlapping(left, gap.pos, 1);
            }
            gap.pos = right;
            ptr::copy_nonoverlapping(right, left, 1);

            left = left.add(1);
        }

        left.sub_ptr(v_base)

        // `gap_opt` goes out of scope and overwrites the last wrong-side element on the right
        // side with the first wrong-side element of the left side, which was initially
        // overwritten by the first wrong-side element on the right side.
    }
}
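
// A safe, index-based sketch of the cyclic permutation above, assuming `Copy`
// elements and a precomputed list of out-of-order pairs: the first left
// element is lifted out once into a gap value, every later pair costs one move
// plus a gap relocation instead of a full three-move swap, and the gap is
// plugged at the end (the job of the `GapGuard` drop impl above).
// `cyclic_pairs_sketch` is a hypothetical helper.
#[cfg(test)]
fn cyclic_pairs_sketch(v: &mut [i32], pairs: &[(usize, usize)]) {
    let Some(&(first_left, _)) = pairs.first() else { return };
    // Lift the first wrong-side element out; its slot becomes the moving gap.
    let gap_value = v[first_left];
    let mut gap_pos = first_left;

    for (i, &(left, right)) in pairs.iter().enumerate() {
        if i > 0 {
            v[gap_pos] = v[left];
        }
        gap_pos = right;
        v[left] = v[right];
    }

    // Plug the final gap with the lifted-out value.
    v[gap_pos] = gap_value;
}

#[test]
fn cyclic_pairs_sketch_works() {
    // Partitioning [9, 8, 1, 2] around pivot 5 finds the pairs (0, 3) and (1, 2).
    let mut v = [9, 8, 1, 2];
    cyclic_pairs_sketch(&mut v, &[(0, 3), (1, 2)]);
    assert_eq!(v, [2, 1, 9, 8]); // all < 5 on the left, all >= 5 on the right
}
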
struct PartitionState<T> {
    // The current element that is being looked at, scans left to right through slice.
    right: *mut T,
    // Counts the number of elements that compared less-than, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_lt: usize,
    // Gap guard that tracks the temporary duplicate in the input.
    gap: GapGuardRaw<T>,
}

fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    // Novel partition implementation by Lukas Bergdoll and Orson Peters. Branchless Lomuto
    // partition paired with a cyclic permutation.
    // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md

    let len = v.len();
    let v_base = v.as_mut_ptr();

    if len == 0 {
        return 0;
    }

    // SAFETY: We checked that `len` is more than zero, which means that reading `v_base` is safe to
    // do. From there we have a bounded loop where `v_base.add(i)` is guaranteed in-bounds. `v` and
    // `pivot` can't alias because of type system rules. The drop-guard `gap` ensures that should
    // `is_less` panic we always overwrite the duplicate in the input. `gap.pos` stores the previous
    // value of `right` and starts at `v_base` and so it too is in-bounds. Given `UNROLL_LEN == 2`
    // after the main loop we either have A) the last element in `v` that has not yet been processed
    // because `len % 2 != 0`, or B) all elements have been processed except the gap value that was
    // saved at the beginning with `ptr::read(v_base)`. In the case A) the loop will iterate twice,
    // first performing loop_body to take care of the last element that didn't fit into the unroll.
    // After that the behavior is the same as for B) where we use the saved value as `right` to
    // overwrite the duplicate. If this very last call to `is_less` panics the saved value will be
    // copied back including all possible changes via interior mutability. If `is_less` does not
    // panic and the code continues we overwrite the duplicate and do `right = right.add(1)`, this
    // is safe to do with `&mut *gap.value` because `T` is the same as `[T; 1]` and generating a
    // pointer one past the allocation is safe.
    unsafe {
        let mut loop_body = |state: &mut PartitionState<T>| {
            let right_is_lt = is_less(&*state.right, pivot);
            let left = v_base.add(state.num_lt);

            ptr::copy(left, state.gap.pos, 1);
            ptr::copy_nonoverlapping(state.right, left, 1);

            state.gap.pos = state.right;
            state.num_lt += right_is_lt as usize;

            state.right = state.right.add(1);
        };

        // Ideally we could just use GapGuard in PartitionState, but the reference that is
        // materialized with `&mut state` when calling `loop_body` would create a mutable reference
        // to the parent struct that contains the gap value, invalidating the reference pointer
        // created from a reference to the gap value in the cleanup loop. This is only an issue
        // under Stacked Borrows, Tree Borrows accepts the intuitive code using GapGuard as valid.
        let mut gap_value = ManuallyDrop::new(ptr::read(v_base));

        let mut state = PartitionState {
            num_lt: 0,
            right: v_base.add(1),

            gap: GapGuardRaw { pos: v_base, value: &mut *gap_value },
        };

        // Manual unrolling that works well on x86, Arm and with opt-level=s without murdering
        // compile-times. Leaving this to the compiler yields ok to bad results.
        let unroll_len = const { if mem::size_of::<T>() <= 16 { 2 } else { 1 } };

        let unroll_end = v_base.add(len - (unroll_len - 1));
        while state.right < unroll_end {
            if unroll_len == 2 {
                loop_body(&mut state);
                loop_body(&mut state);
            } else {
                loop_body(&mut state);
            }
        }

        // Instantiate `loop_body` once for both the unroll cleanup and the cyclic permutation
        // cleanup. Optimizes binary-size and compile-time.
        let end = v_base.add(len);
        loop {
            let is_done = state.right == end;
            state.right = if is_done { state.gap.value } else { state.right };

            loop_body(&mut state);

            if is_done {
                mem::forget(state.gap);
                break;
            }
        }

        state.num_lt
    }
}
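
// A safe sketch of the branchless Lomuto scheme above, assuming a plain swap
// instead of the cyclic permutation: every element is moved unconditionally
// and `num_lt` advances by the comparison result cast to an integer, so the
// loop contains no data-dependent branch. `lomuto_branchless_sketch` is a
// hypothetical helper.
#[cfg(test)]
fn lomuto_branchless_sketch<T, F: FnMut(&T) -> bool>(v: &mut [T], mut is_lt: F) -> usize {
    let mut num_lt = 0;
    for i in 0..v.len() {
        let lt = is_lt(&v[i]);
        // Unconditional move; a no-op when `num_lt == i`.
        v.swap(num_lt, i);
        num_lt += lt as usize;
    }
    num_lt
}

#[test]
fn lomuto_branchless_sketch_works() {
    let mut v = [5, 1, 4, 2, 3];
    let num_lt = lomuto_branchless_sketch(&mut v, |x| *x < 3);
    assert_eq!(num_lt, 2);
    assert!(v[..num_lt].iter().all(|x| *x < 3));
    assert!(v[num_lt..].iter().all(|x| *x >= 3));
}
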
struct GapGuard<T> {
    pos: *mut T,
    value: ManuallyDrop<T>,
}

impl<T> Drop for GapGuard<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(&*self.value, self.pos, 1);
        }
    }
}

/// Ideally this wouldn't be needed and we could just use the regular GapGuard.
/// See comment in [`partition_lomuto_branchless_cyclic`].
struct GapGuardRaw<T> {
    pos: *mut T,
    value: *mut T,
}

impl<T> Drop for GapGuardRaw<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.value, self.pos, 1);
        }
    }
}
@@ -49,7 +49,6 @@
#![feature(is_sorted)]
#![feature(layout_for_ptr)]
#![feature(pattern)]
#![feature(sort_internals)]
#![feature(slice_take)]
#![feature(slice_from_ptr_range)]
#![feature(slice_split_once)]

@@ -1803,9 +1803,8 @@ fn brute_force_rotate_test_1() {
#[test]
#[cfg(not(target_arch = "wasm32"))]
fn sort_unstable() {
    use core::cmp::Ordering::{Equal, Greater, Less};
    use core::slice::heapsort;
    use rand::{seq::SliceRandom, Rng};
    // use core::cmp::Ordering::{Equal, Greater, Less};
    use rand::Rng;

    // Miri is too slow (but still need to `chain` to make the types match)
    let lens = if cfg!(miri) { (2..20).chain(0..0) } else { (2..25).chain(500..510) };
@@ -1839,31 +1838,10 @@ fn sort_unstable() {
                tmp.copy_from_slice(v);
                tmp.sort_unstable_by(|a, b| b.cmp(a));
                assert!(tmp.windows(2).all(|w| w[0] >= w[1]));

                // Test heapsort using `<` operator.
                tmp.copy_from_slice(v);
                heapsort(tmp, |a, b| a < b);
                assert!(tmp.windows(2).all(|w| w[0] <= w[1]));

                // Test heapsort using `>` operator.
                tmp.copy_from_slice(v);
                heapsort(tmp, |a, b| a > b);
                assert!(tmp.windows(2).all(|w| w[0] >= w[1]));
            }
        }
    }

    // Sort using a completely random comparison function.
    // This will reorder the elements *somehow*, but won't panic.
    for i in 0..v.len() {
        v[i] = i as i32;
    }
    v.sort_unstable_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
    v.sort_unstable();
    for i in 0..v.len() {
        assert_eq!(v[i], i as i32);
    }

    // Should not panic.
    [0i32; 0].sort_unstable();
    [(); 10].sort_unstable();