Replace sort implementations
- `slice::sort` -> driftsort: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
- `slice::sort_unstable` -> ipnsort: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md

Replaces the sort implementations with tailor-made ones that strike a balance of run-time, compile-time and binary-size, yielding run-time and compile-time improvements. Binary-size regresses for `slice::sort` but improves for `slice::sort_unstable`. All while upholding the existing soft and hard safety guarantees, and even extending the soft guarantees: strict weak ordering violations are now detected with a high chance and reported to users via a panic. In addition, the implementation of `select_nth_unstable` is adapted, as it uses `slice::sort_unstable` internals.
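The new ordering-violation guarantee is easiest to see in action. A minimal sketch (not part of the diff) of what user code can now rely on; whether the panic actually fires depends on the input, so violations are only detected "with a high chance":

```rust
use std::cmp::Ordering;

fn main() {
    // A comparator that always answers `Less` violates strict weak ordering
    // (it is not antisymmetric). The new implementations may detect this and
    // panic; in all cases every original element remains in the slice.
    let mut v = vec![3, 1, 2];
    let result = std::panic::catch_unwind(move || {
        v.sort_by(|_, _| Ordering::Less);
        v
    });
    if let Ok(v) = result {
        // No element was lost or duplicated, even though the order is
        // unspecified.
        assert_eq!(v.len(), 3);
    }
}
```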
+104 -111
@@ -16,7 +16,7 @@
#[cfg(not(no_global_oom_handling))]
use core::cmp::Ordering::{self, Less};
#[cfg(not(no_global_oom_handling))]
-use core::mem::{self, SizedTypeProperties};
+use core::mem::{self, MaybeUninit};
#[cfg(not(no_global_oom_handling))]
use core::ptr;
#[cfg(not(no_global_oom_handling))]
@@ -24,7 +24,7 @@
use crate::alloc::Allocator;
#[cfg(not(no_global_oom_handling))]
-use crate::alloc::{self, Global};
+use crate::alloc::Global;
#[cfg(not(no_global_oom_handling))]
use crate::borrow::ToOwned;
use crate::boxed::Box;
@@ -174,23 +174,32 @@ fn to_vec<A: Allocator>(s: &[Self], alloc: A) -> Vec<Self, A> {

#[cfg(not(test))]
impl<T> [T] {
-/// Sorts the slice.
+/// Sorts the slice, preserving initial order of equal elements.
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
+/// worst-case.
///
+/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
+/// original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `T: Ord` panics.
+///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
-/// sorting and it doesn't allocate auxiliary memory.
-/// See [`sort_unstable`](slice::sort_unstable).
+/// sorting and it doesn't allocate auxiliary memory. See
+/// [`sort_unstable`](slice::sort_unstable). The exceptions are partially sorted slices, which
+/// may be better served with `slice::sort`.
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
+///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@@ -200,6 +209,8 @@ impl<T> [T] {
/// v.sort();
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@@ -211,13 +222,18 @@ pub fn sort(&mut self)
stable_sort(self, T::lt);
}
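What "preserving initial order of equal elements" buys in practice is easiest to show with compound values. A small illustration (not part of the diff), sorting pairs by their first field only:

```rust
fn main() {
    // A stable sort keeps the original left-to-right order of the two
    // entries whose key is `1`.
    let mut v = [(1, 'b'), (0, 'x'), (1, 'a')];
    v.sort_by_key(|&(k, _)| k);
    assert_eq!(v, [(0, 'x'), (1, 'b'), (1, 'a')]);
    // `sort_unstable_by_key` makes no such promise: `(1, 'b')` and
    // `(1, 'a')` could come out in either order.
}
```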

-/// Sorts the slice with a comparator function.
+/// Sorts the slice with a comparator function, preserving initial order of equal elements.
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
+/// worst-case.
///
-/// The comparator function must define a total ordering for the elements in the slice. If
-/// the ordering is not total, the order of the elements is unspecified. An order is a
-/// total order if it is (for all `a`, `b` and `c`):
+/// The comparator function should define a total ordering for the elements in the slice. If the
+/// ordering is not total, the order of the elements is unspecified.
+///
+/// If the comparator function does not implement a total order the resulting order is
+/// unspecified. All original elements will remain in the slice and any possible modifications
+/// via interior mutability are observed in the input. Same is true if the comparator function
+/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
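A hedged example (not part of the diff) of what these two laws rule out; comparators like the second one below are exactly what the new implementations try to detect:

```rust
fn main() {
    let mut v = [3_i32, -1, 2];

    // Lawful: `i32::cmp` on absolute values is total and transitive, so the
    // result is fully specified (up to the order of equal keys).
    v.sort_by(|a, b| a.abs().cmp(&b.abs()));
    assert_eq!(v, [-1, 2, 3]);

    // Unlawful (left commented out on purpose): a comparator that never
    // answers `Equal` claims both `a < b` and `b < a` for equal elements,
    // violating antisymmetry. The resulting order is unspecified and the
    // sort may panic:
    //
    // v.sort_by(|a, b| if a <= b { std::cmp::Ordering::Less } else { std::cmp::Ordering::Greater });
}
```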
@@ -227,23 +243,22 @@ pub fn sort(&mut self)
///
/// ```
/// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
-/// floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
+/// floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
/// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
/// ```
///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
/// sorting and it doesn't allocate auxiliary memory.
/// See [`sort_unstable_by`](slice::sort_unstable_by).
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
+///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@@ -256,6 +271,8 @@ pub fn sort(&mut self)
/// v.sort_by(|a, b| b.cmp(a));
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@@ -267,28 +284,27 @@ pub fn sort_by<F>(&mut self, mut compare: F)
stable_sort(self, |a, b| compare(a, b) == Less);
}

-/// Sorts the slice with a key extraction function.
+/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
/// worst-case, where the key function is *O*(*m*).
///
/// For expensive key functions (e.g. functions that are not simple property accesses or
/// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be
/// significantly faster, as it does not recompute element keys.
///
-/// When applicable, unstable sorting is preferred because it is generally faster than stable
-/// sorting and it doesn't allocate auxiliary memory.
-/// See [`sort_unstable_by_key`](slice::sort_unstable_by_key).
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
///
/// # Current implementation
///
-/// The current algorithm is an adaptive, iterative merge sort inspired by
-/// [timsort](https://en.wikipedia.org/wiki/Timsort).
-/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
-/// two or more sorted sequences concatenated one after another.
+/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
+/// combines the fast average case of quicksort with the fast worst case and partial run
+/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
+/// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Also, it allocates temporary storage half the size of `self`, but for short slices a
-/// non-allocating insertion sort is used instead.
+/// The auxiliary memory allocation behavior depends on the input length. Short slices are
+/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
+/// clamps at `self.len() / 2`.
///
+/// If `K: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -298,6 +314,8 @@ pub fn sort_by<F>(&mut self, mut compare: F)
/// v.sort_by_key(|k| k.abs());
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
+///
+/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_key", since = "1.7.0")]
@@ -310,27 +328,30 @@ pub fn sort_by_key<K, F>(&mut self, mut f: F)
stable_sort(self, |a, b| f(a).lt(&f(b)));
}

-/// Sorts the slice with a key extraction function.
+/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
-/// During sorting, the key function is called at most once per element, by using
-/// temporary storage to remember the results of key evaluation.
-/// The order of calls to the key function is unspecified and may change in future versions
-/// of the standard library.
+/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \*
+/// log(*n*)) worst-case, where the key function is *O*(*m*).
///
-/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
-/// worst-case, where the key function is *O*(*m*).
+/// During sorting, the key function is called at most once per element, by using temporary
+/// storage to remember the results of key evaluation. The order of calls to the key function is
+/// unspecified and may change in future versions of the standard library.
///
-/// For simple key functions (e.g., functions that are property accesses or
-/// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be
-/// faster.
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
+///
+/// For simple key functions (e.g., functions that are property accesses or basic operations),
+/// [`sort_by_key`](slice::sort_by_key) is likely to be faster.
///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [instruction-parallel-network sort][ipnsort] by Lukas
+/// Bergdoll, which combines the fast average case of randomized quicksort with the fast worst
+/// case of heapsort, while achieving linear time on fully sorted and reversed inputs. And
+/// *O*(*k* \* log(*n*)) where *k* is the number of distinct elements in the input. It leverages
+/// superscalar out-of-order execution capabilities commonly found in CPUs, to efficiently
+/// perform the operation.
///
/// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
/// length of the slice.
@@ -344,7 +365,7 @@ pub fn sort_by_key<K, F>(&mut self, mut f: F)
/// assert!(v == [-3, -5, 2, 32, 4]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")]
@@ -361,7 +382,7 @@ macro_rules! sort_by_key {
$slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
// The elements of `indices` are unique, as they are indexed, so any sort will be
// stable with respect to the original slice. We use `sort_unstable` here because
-// it requires less memory allocation.
+// it requires no memory allocation.
indices.sort_unstable();
for i in 0..$slice.len() {
let mut index = indices[i].1;
@@ -374,24 +395,24 @@ macro_rules! sort_by_key {
}};
}

-let sz_u8 = mem::size_of::<(K, u8)>();
-let sz_u16 = mem::size_of::<(K, u16)>();
-let sz_u32 = mem::size_of::<(K, u32)>();
-let sz_usize = mem::size_of::<(K, usize)>();

let len = self.len();
if len < 2 {
return;
}
-if sz_u8 < sz_u16 && len <= (u8::MAX as usize) {
-return sort_by_key!(u8, self, f);
-}
-if sz_u16 < sz_u32 && len <= (u16::MAX as usize) {
-return sort_by_key!(u16, self, f);
-}
-if sz_u32 < sz_usize && len <= (u32::MAX as usize) {

+// Avoids binary-size usage in cases where the alignment doesn't work out to make this
+// beneficial or on 32-bit platforms.
+let is_using_u32_as_idx_type_helpful =
+const { mem::size_of::<(K, u32)>() < mem::size_of::<(K, usize)>() };

+// It's possible to instantiate this for u8 and u16 but doing so is very wasteful in terms
+// of compile-times and binary-size; the peak saved heap memory for u16 is (u8 + u16) -> 4
+// bytes * u16::MAX vs (u8 + u32) -> 8 bytes * u16::MAX, the saved heap memory is at peak
+// ~262KB.
+if is_using_u32_as_idx_type_helpful && len <= (u32::MAX as usize) {
return sort_by_key!(u32, self, f);
}

sort_by_key!(usize, self, f)
}
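The `const` size check above is all about tuple padding. A small demonstration (not from the diff; a 64-bit target is assumed) of when the `u32` index type actually shrinks the temporary buffer:

```rust
use std::mem::size_of;

fn main() {
    // With a 4-byte key, switching the index from `usize` to `u32` halves
    // the per-element cost of the `(key, index)` buffer...
    assert_eq!(size_of::<(u32, u32)>(), 8);
    assert_eq!(size_of::<(u32, usize)>(), 16);

    // ...but with an 8-byte key, alignment pads `(u64, u32)` back up to 16
    // bytes, so the `u32` index buys nothing. That is exactly the case the
    // `is_using_u32_as_idx_type_helpful` check filters out.
    assert_eq!(size_of::<(u64, u32)>(), size_of::<(u64, usize)>());
}
```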
@@ -843,46 +864,18 @@ fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
where
F: FnMut(&T, &T) -> bool,
{
-if T::IS_ZST {
-// Sorting has no meaningful behavior on zero-sized types. Do nothing.
-return;
+use sort::stable::BufGuard;

+#[unstable(issue = "none", feature = "std_internals")]
+impl<T> BufGuard<T> for Vec<T> {
+fn with_capacity(capacity: usize) -> Self {
+Vec::with_capacity(capacity)
}

+fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
+self.spare_capacity_mut()
+}
+}

-let elem_alloc_fn = |len: usize| -> *mut T {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
-// v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
-// elements.
-unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
-};

-let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
-// v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
-// len.
-unsafe {
-alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
-}
-};

-let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
-// SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
-// obscene length or 0.
-unsafe {
-alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
-as *mut sort::TimSortRun
-}
-};

-let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
-// SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
-// len.
-unsafe {
-alloc::dealloc(
-buf_ptr as *mut u8,
-alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
-);
-}
-};

-sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
+sort::stable::sort::<T, F, Vec<T>>(v, &mut is_less);
}
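The `BufGuard` impl above is the whole alloc-side contribution: the sorting logic lives in `core`, which cannot allocate, so it is generic over a scratch-buffer provider. A hedged sketch (hypothetical `ScratchBuf` trait, not the std-internal one) of that decoupling:

```rust
use std::mem::MaybeUninit;

// The core sort only needs something that can hand out uninitialized
// scratch space; `alloc` plugs in `Vec<T>`.
trait ScratchBuf<T> {
    fn with_capacity(capacity: usize) -> Self;
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}

impl<T> ScratchBuf<T> for Vec<T> {
    fn with_capacity(capacity: usize) -> Self {
        Vec::with_capacity(capacity)
    }
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        // The allocated-but-unused region of the Vec is exactly the
        // scratch space a merge needs; it is freed when the Vec drops.
        self.spare_capacity_mut()
    }
}

fn main() {
    let mut buf: Vec<u32> = ScratchBuf::with_capacity(8);
    assert!(ScratchBuf::as_uninit_slice_mut(&mut buf).len() >= 8);
}
```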

@@ -34,7 +34,7 @@ macro_rules! do_test {
}

let v = $input.to_owned();
-let _ = std::panic::catch_unwind(move || {
+let _ = panic::catch_unwind(move || {
let mut v = v;
let mut panic_countdown = panic_countdown;
v.$func(|a, b| {
@@ -197,8 +197,7 @@ fn panic_safe() {

let mut rng = test_rng();

-// Miri is too slow (but still need to `chain` to make the types match)
-let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) };
+let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) };
let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] };

for len in lens {
@@ -294,15 +293,20 @@ fn test_sort() {
}
}

-// Sort using a completely random comparison function.
-// This will reorder the elements *somehow*, but won't panic.
-let mut v = [0; 500];
-for i in 0..v.len() {
+const ORD_VIOLATION_MAX_LEN: usize = 500;
+let mut v = [0; ORD_VIOLATION_MAX_LEN];
+for i in 0..ORD_VIOLATION_MAX_LEN {
v[i] = i as i32;
}
-v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());

+// Sort using a completely random comparison function. This will reorder the elements *somehow*,
+// it may panic but the original elements must still be present.
+let _ = panic::catch_unwind(move || {
+v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
+});

v.sort();
-for i in 0..v.len() {
+for i in 0..ORD_VIOLATION_MAX_LEN {
assert_eq!(v[i], i as i32);
}
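A standalone distillation (not the test above) of the contract this test pins down: under a comparator that violates strict weak ordering, the sort may panic, but it may not lose, duplicate, or invent elements:

```rust
use std::cmp::Ordering;
use std::panic::{self, AssertUnwindSafe};

fn main() {
    let mut v: Vec<i32> = (0..100).collect();
    let mut flip = false;
    let _ = panic::catch_unwind(AssertUnwindSafe(|| {
        v.sort_by(|_, _| {
            // Alternating Less/Greater is not a strict weak ordering.
            flip = !flip;
            if flip { Ordering::Less } else { Ordering::Greater }
        });
    }));
    // Whatever happened, all original elements are still present exactly once.
    v.sort();
    assert_eq!(v, (0..100).collect::<Vec<i32>>());
}
```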
+109 -84
@@ -39,7 +39,6 @@
mod iter;
mod raw;
mod rotate;
-mod select;
mod specialize;

#[unstable(feature = "str_internals", issue = "none")]
@@ -83,10 +82,6 @@
#[unstable(feature = "slice_from_ptr_range", issue = "89792")]
pub use raw::{from_mut_ptr_range, from_ptr_range};

-// This function is public only because there is no other way to unit test heapsort.
-#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
-pub use sort::heapsort;

#[stable(feature = "slice_get_slice", since = "1.28.0")]
pub use index::SliceIndex;
@@ -2884,21 +2879,26 @@ pub fn binary_search_by_key<'a, B, F>(&'a self, b: &B, mut f: F) -> Result<usize
self.binary_search_by(|k| f(k).cmp(b))
}

-/// Sorts the slice, but might not preserve the order of equal elements.
+/// Sorts the slice **without** preserving the initial order of equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
+/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
+/// original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `T: Ord` panics.
+///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
-/// slice consists of several concatenated sorted sequences.
+/// slice is partially sorted.
///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -2909,25 +2909,29 @@ pub fn binary_search_by_key<'a, B, F>(&'a self, b: &B, mut f: F) -> Result<usize
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable(&mut self)
where
T: Ord,
{
-sort::quicksort(self, T::lt);
+sort::unstable::sort(self, &mut T::lt);
}

-/// Sorts the slice with a comparator function, but might not preserve the order of equal
-/// elements.
+/// Sorts the slice with a comparator function, **without** preserving the initial order of
+/// equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
-/// The comparator function must define a total ordering for the elements in the slice. If
-/// the ordering is not total, the order of the elements is unspecified. An order is a
-/// total order if it is (for all `a`, `b` and `c`):
+/// The comparator function should define a total ordering for the elements in the slice. If the
+/// ordering is not total, the order of the elements is unspecified.
+///
+/// If the comparator function does not implement a total order the resulting order is
+/// unspecified. All original elements will remain in the slice and any possible modifications
+/// via interior mutability are observed in the input. Same is true if the comparator function
+/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
@@ -2943,14 +2947,15 @@ pub fn sort_unstable(&mut self)
///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
-/// slice consists of several concatenated sorted sequences.
+/// slice is partially sorted.
///
+/// If `T: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -2964,34 +2969,37 @@ pub fn sort_unstable(&mut self)
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by<F>(&mut self, mut compare: F)
where
F: FnMut(&T, &T) -> Ordering,
{
-sort::quicksort(self, |a, b| compare(a, b) == Ordering::Less);
+sort::unstable::sort(self, &mut |a, b| compare(a, b) == Ordering::Less);
}

-/// Sorts the slice with a key extraction function, but might not preserve the order of equal
-/// elements.
+/// Sorts the slice with a key extraction function, **without** preserving the initial order of
+/// equal elements.
///
-/// This sort is unstable (i.e., may reorder equal elements), in-place
-/// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case, where the key function is
-/// *O*(*m*).
+/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
+/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
+/// If `K: Ord` does not implement a total order the resulting order is unspecified.
+/// All original elements will remain in the slice and any possible modifications via interior
+/// mutability are observed in the input. Same is true if `K: Ord` panics.
+///
/// # Current implementation
///
-/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
-/// which combines the fast average case of randomized quicksort with the fast worst case of
-/// heapsort, while achieving linear time on slices with certain patterns. It uses some
-/// randomization to avoid degenerate cases, but with a fixed seed to always provide
-/// deterministic behavior.
+/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
+/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
+/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
+/// expected time to sort the data is *O*(*n* \* log(*k*)).
///
-/// Due to its key calling strategy, [`sort_unstable_by_key`](#method.sort_unstable_by_key)
-/// is likely to be slower than [`sort_by_cached_key`](#method.sort_by_cached_key) in
-/// cases where the key function is expensive.
+/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
+/// slice is partially sorted.
///
+/// If `K: Ord` does not implement a total order, the implementation may panic.
+///
/// # Examples
///
@@ -3002,7 +3010,7 @@ pub fn sort_unstable_by<F>(&mut self, mut compare: F)
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
///
-/// [pdqsort]: https://github.com/orlp/pdqsort
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
@@ -3010,27 +3018,32 @@ pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
F: FnMut(&T) -> K,
K: Ord,
{
-sort::quicksort(self, |a, b| f(a).lt(&f(b)));
+sort::unstable::sort(self, &mut |a, b| f(a).lt(&f(b)));
}

-/// Reorder the slice such that the element at `index` after the reordering is at its final sorted position.
+/// Reorder the slice such that the element at `index` after the reordering is at its final
+/// sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index`. Additionally, this reordering is
-/// unstable (i.e. any number of equal elements may end up at position `index`), in-place
-/// (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// unstable (i.e. any number of equal elements may end up at position `index`), in-place (i.e.
+/// does not allocate), and runs in *O*(*n*) time. This function is also known as "kth element"
+/// in other libraries.
///
-/// It returns a triplet of the following from the reordered slice:
-/// the subslice prior to `index`, the element at `index`, and the subslice after `index`;
-/// accordingly, the values in those two subslices will respectively all be less-than-or-equal-to
-/// and greater-than-or-equal-to the value of the element at `index`.
+/// It returns a triplet of the following from the reordered slice: the subslice prior to
+/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
+/// those two subslices will respectively all be less-than-or-equal-to and
+/// greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3058,35 +3071,40 @@ pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
/// v == [-3, -5, 1, 4, 2] ||
/// v == [-5, -3, 1, 4, 2]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [T])
where
T: Ord,
{
-select::partition_at_index(self, index, T::lt)
+sort::select::partition_at_index(self, index, T::lt)
}
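The partition guarantee described above is what makes `select_nth_unstable` a one-call median/percentile query. A quick illustration (not part of the diff):

```rust
fn main() {
    let mut v = [7, 1, 5, 3, 9];
    // After the call, index 2 holds the value a full sort would put there
    // (the median); everything before is <= it, everything after >= it.
    let (lesser, median, greater) = v.select_nth_unstable(2);
    assert_eq!(*median, 5);
    assert!(lesser.iter().all(|&x| x <= 5));
    assert!(greater.iter().all(|&x| x >= 5));
}
```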

-/// Reorder the slice with a comparator function such that the element at `index` after the reordering is at
-/// its final sorted position.
+/// Reorder the slice with a comparator function such that the element at `index` after the
+/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the comparator function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
-/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
+/// function is also known as "kth element" in other libraries.
///
-/// It returns a triplet of the following from
-/// the slice reordered according to the provided comparator function: the subslice prior to
-/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
-/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
-/// the value of the element at `index`.
+/// It returns a triplet of the following from the slice reordered according to the provided
+/// comparator function: the subslice prior to `index`, the element at `index`, and the subslice
+/// after `index`; accordingly, the values in those two subslices will respectively all be
+/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3114,6 +3132,8 @@ pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [
/// v == [4, 2, 1, -5, -3] ||
/// v == [4, 2, 1, -3, -5]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by<F>(
@@ -3124,29 +3144,32 @@ pub fn select_nth_unstable_by<F>(
where
F: FnMut(&T, &T) -> Ordering,
{
-select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
+sort::select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
}

-/// Reorder the slice with a key extraction function such that the element at `index` after the reordering is
-/// at its final sorted position.
+/// Reorder the slice with a key extraction function such that the element at `index` after the
+/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the key extraction function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
-/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
-/// This function is also known as "kth element" in other libraries.
+/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
+/// function is also known as "kth element" in other libraries.
///
-/// It returns a triplet of the following from
-/// the slice reordered according to the provided key extraction function: the subslice prior to
-/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
-/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
-/// the value of the element at `index`.
+/// It returns a triplet of the following from the slice reordered according to the provided key
+/// extraction function: the subslice prior to `index`, the element at `index`, and the subslice
+/// after `index`; accordingly, the values in those two subslices will respectively all be
+/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
-/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
-/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
-/// pivot selection, which guarantees linear runtime for all inputs.
+/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
+/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
+/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
+/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@@ -3174,6 +3197,8 @@ pub fn select_nth_unstable_by<F>(
/// v == [2, 1, -3, 4, -5] ||
/// v == [2, 1, -3, -5, 4]);
/// ```
///
+/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by_key<K, F>(
@@ -3185,7 +3210,7 @@ pub fn select_nth_unstable_by_key<K, F>(
F: FnMut(&T) -> K,
K: Ord,
{
-select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
+sort::select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
}

/// Moves all consecutive repeated elements to the end of the slice according to the
@@ -1,1383 +0,0 @@
//! Slice sorting
//!
//! This module contains a sorting algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//!
//! Unstable sorting is compatible with core because it doesn't allocate memory, unlike our
//! stable sorting implementation.
//!
//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on
//! TimSort.

use crate::cmp;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
use crate::ptr;

// When dropped, copies from `src` into `dest`.
struct InsertionHole<T> {
    src: *const T,
    dest: *mut T,
}

impl<T> Drop for InsertionHole<T> {
    fn drop(&mut self) {
        // SAFETY: This is a helper class. Please refer to its usage for correctness. Namely, one
        // must be sure that `src` and `dst` do not overlap as required by
        // `ptr::copy_nonoverlapping` and are both valid for writes.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dest, 1);
        }
    }
}
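`InsertionHole` is a drop guard: while an element has been moved out and the slice temporarily holds a duplicate, an unwind from `is_less` must still restore every value exactly once. A safe analogue of the idea (illustrative only, hypothetical names):

```rust
// Do work that may panic while an invariant is temporarily broken, and
// let `Drop` restore it even on unwind.
struct RestoreOnDrop<'a> {
    slot: &'a mut Option<i32>,
    value: i32,
}

impl Drop for RestoreOnDrop<'_> {
    fn drop(&mut self) {
        // Runs on normal exit *and* on panic-unwind, so the "hole" in
        // `slot` is always filled again.
        *self.slot = Some(self.value);
    }
}

fn main() {
    let mut slot = Some(1);
    let value = slot.take().unwrap(); // invariant broken: `slot` is empty
    let guard = RestoreOnDrop { slot: &mut slot, value };
    // ...comparisons that might panic would run here...
    drop(guard); // invariant restored (also on unwind)
    assert_eq!(slot, Some(1));
}
```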

/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
/// becomes sorted.
unsafe fn insert_tail<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    debug_assert!(v.len() >= 2);

    let arr_ptr = v.as_mut_ptr();
    let i = v.len() - 1;

    // SAFETY: caller must ensure v is at least len 2.
    unsafe {
        // See insert_head which talks about why this approach is beneficial.
        let i_ptr = arr_ptr.add(i);

        // It's important that we use i_ptr here. If this check is positive and we continue,
        // we want to make sure that no other copy of the value was seen by is_less.
        // Otherwise we would have to copy it back.
        if is_less(&*i_ptr, &*i_ptr.sub(1)) {
            // It's important that we use tmp for comparison from now on. As it is the value that
            // will be copied back. And notionally we could have created a divergence if we copy
            // back the wrong value.
            let tmp = mem::ManuallyDrop::new(ptr::read(i_ptr));
            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole { src: &*tmp, dest: i_ptr.sub(1) };
            ptr::copy_nonoverlapping(hole.dest, i_ptr, 1);

            // SAFETY: We know i is at least 1.
            for j in (0..(i - 1)).rev() {
                let j_ptr = arr_ptr.add(j);
                if !is_less(&*tmp, &*j_ptr) {
                    break;
                }

                ptr::copy_nonoverlapping(j_ptr, hole.dest, 1);
                hole.dest = j_ptr;
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }
}

/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
unsafe fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    debug_assert!(v.len() >= 2);

    // SAFETY: caller must ensure v is at least len 2.
    unsafe {
        if is_less(v.get_unchecked(1), v.get_unchecked(0)) {
            let arr_ptr = v.as_mut_ptr();

            // There are three ways to implement insertion here:
            //
            // 1. Swap adjacent elements until the first one gets to its final destination.
            //    However, this way we copy data around more than is necessary. If elements are big
            //    structures (costly to copy), this method will be slow.
            //
            // 2. Iterate until the right place for the first element is found. Then shift the
            //    elements succeeding it to make room for it and finally place it into the
            //    remaining hole. This is a good method.
            //
            // 3. Copy the first element into a temporary variable. Iterate until the right place
            //    for it is found. As we go along, copy every traversed element into the slot
            //    preceding it. Finally, copy data from the temporary variable into the remaining
            //    hole. This method is very good. Benchmarks demonstrated slightly better
            //    performance than with the 2nd method.
            //
            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
            let tmp = mem::ManuallyDrop::new(ptr::read(arr_ptr));

            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole { src: &*tmp, dest: arr_ptr.add(1) };
            ptr::copy_nonoverlapping(arr_ptr.add(1), arr_ptr.add(0), 1);

            for i in 2..v.len() {
                if !is_less(&v.get_unchecked(i), &*tmp) {
                    break;
                }
                ptr::copy_nonoverlapping(arr_ptr.add(i), arr_ptr.add(i - 1), 1);
                hole.dest = arr_ptr.add(i);
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }
}
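Method 3 is easiest to see without the raw pointers. A safe sketch (hypothetical `insert_head_safe`, restricted to `Copy + Ord`) of the same stash-shift-fill sequence; the real code above does this with `ptr::copy_nonoverlapping` plus the `InsertionHole` guard for arbitrary `T` and without bounds checks:

```rust
fn insert_head_safe<T: Copy + Ord>(v: &mut [T]) {
    if v.len() >= 2 && v[1] < v[0] {
        // Stash the out-of-place head (the "temporary variable").
        let tmp = v[0];
        // Find where it belongs in the already-sorted tail `v[1..]`.
        let pos = v[1..].partition_point(|x| *x < tmp);
        // Shift the traversed elements one slot to the left in bulk...
        v.copy_within(1..pos + 1, 0);
        // ...and fill the remaining hole.
        v[pos] = tmp;
    }
}

fn main() {
    let mut v = [9, 1, 3, 5, 7];
    insert_head_safe(&mut v);
    assert_eq!(v, [1, 3, 5, 7, 9]);
}
```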

/// Sort `v` assuming `v[..offset]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact. Even improving performance in some cases.
#[inline(never)]
pub(super) fn insertion_sort_shift_left<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Using assert here improves performance.
    assert!(offset != 0 && offset <= len);

    // Shift each element of the unsorted region v[i..] as far left as is needed to make v sorted.
    for i in offset..len {
        // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len
        // >= 2. The range is exclusive and we know `i` must be at least 1 so this slice has at
        // least len 2.
        unsafe {
            insert_tail(&mut v[..=i], is_less);
        }
    }
}

/// Sort `v` assuming `v[offset..]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact. Even improving performance in some cases.
#[inline(never)]
fn insertion_sort_shift_right<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Using assert here improves performance.
    assert!(offset != 0 && offset <= len && len >= 2);

    // Shift each element of the unsorted region v[..i] as far left as is needed to make v sorted.
    for i in (0..offset).rev() {
        // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len
        // >= 2. We ensured that the slice length is always at least 2 long. We know that start_found
        // will be at least one less than end, and the range is exclusive. Which gives us i always
        // <= (end - 2).
        unsafe {
            insert_head(&mut v[i..len], is_less);
        }
    }
}
/// Partially sorts a slice by shifting several out-of-order elements around.
///
/// Returns `true` if the slice is sorted at the end. This function is *O*(*n*) worst-case.
#[cold]
fn partial_insertion_sort<T, F>(v: &mut [T], is_less: &mut F) -> bool
where
    F: FnMut(&T, &T) -> bool,
{
    // Maximum number of adjacent out-of-order pairs that will get shifted.
    const MAX_STEPS: usize = 5;
    // If the slice is shorter than this, don't shift any elements.
    const SHORTEST_SHIFTING: usize = 50;

    let len = v.len();
    let mut i = 1;

    for _ in 0..MAX_STEPS {
        // SAFETY: We already explicitly did the bound checking with `i < len`.
        // All our subsequent indexing is only in the range `0 <= index < len`
        unsafe {
            // Find the next pair of adjacent out-of-order elements.
            while i < len && !is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
                i += 1;
            }
        }

        // Are we done?
        if i == len {
            return true;
        }

        // Don't shift elements on short arrays, that has a performance cost.
        if len < SHORTEST_SHIFTING {
            return false;
        }

        // Swap the found pair of elements. This puts them in correct order.
        v.swap(i - 1, i);

        if i >= 2 {
            // Shift the smaller element to the left.
            insertion_sort_shift_left(&mut v[..i], i - 1, is_less);

            // Shift the greater element to the right.
            insertion_sort_shift_right(&mut v[..i], 1, is_less);
        }
    }

    // Didn't manage to sort the slice in the limited number of steps.
    false
}
/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
#[cold]
#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
pub fn heapsort<T, F>(v: &mut [T], mut is_less: F)
where
    F: FnMut(&T, &T) -> bool,
{
    // This binary heap respects the invariant `parent >= child`.
    let mut sift_down = |v: &mut [T], mut node| {
        loop {
            // Children of `node`.
            let mut child = 2 * node + 1;
            if child >= v.len() {
                break;
            }

            // Choose the greater child.
            if child + 1 < v.len() {
                // We need a branch to be sure not to out-of-bounds index,
                // but it's highly predictable. The comparison, however,
                // is better done branchless, especially for primitives.
                child += is_less(&v[child], &v[child + 1]) as usize;
            }

            // Stop if the invariant holds at `node`.
            if !is_less(&v[node], &v[child]) {
                break;
            }

            // Swap `node` with the greater child, move one step down, and continue sifting.
            v.swap(node, child);
            node = child;
        }
    };

    // Build the heap in linear time.
    for i in (0..v.len() / 2).rev() {
        sift_down(v, i);
    }

    // Pop maximal elements from the heap.
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift_down(&mut v[..i], 0);
    }
}
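The closure above is the whole algorithm. A self-contained copy of the same sift-down loop, specialized to `i32` with `<` as `is_less`, shows the two phases at work (illustrative, not the std code):

```rust
fn sift_down(v: &mut [i32], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater child; branchless bump as in the original.
        if child + 1 < v.len() {
            child += (v[child] < v[child + 1]) as usize;
        }
        // Stop once the invariant `parent >= child` holds at `node`.
        if v[node] >= v[child] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}

fn main() {
    let mut v = [5, 1, 4, 2, 3];
    // Build the max-heap bottom-up in linear time...
    for i in (0..v.len() / 2).rev() {
        sift_down(&mut v, i);
    }
    // ...then repeatedly move the current maximum to the end.
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift_down(&mut v[..i], 0);
    }
    assert_eq!(v, [1, 2, 3, 4, 5]);
}
```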

/// Partitions `v` into elements smaller than `pivot`, followed by elements greater than or equal
/// to `pivot`.
///
/// Returns the number of elements smaller than `pivot`.
///
/// Partitioning is performed block-by-block in order to minimize the cost of branching operations.
/// This idea is presented in the [BlockQuicksort][pdf] paper.
///
/// [pdf]: https://drops.dagstuhl.de/opus/volltexte/2016/6389/pdf/LIPIcs-ESA-2016-38.pdf
fn partition_in_blocks<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    // Number of elements in a typical block.
    const BLOCK: usize = 128;

    // The partitioning algorithm repeats the following steps until completion:
    //
    // 1. Trace a block from the left side to identify elements greater than or equal to the pivot.
    // 2. Trace a block from the right side to identify elements smaller than the pivot.
    // 3. Exchange the identified elements between the left and right side.
    //
    // We keep the following variables for a block of elements:
    //
    // 1. `block` - Number of elements in the block.
    // 2. `start` - Start pointer into the `offsets` array.
    // 3. `end` - End pointer into the `offsets` array.
    // 4. `offsets` - Indices of out-of-order elements within the block.

    // The current block on the left side (from `l` to `l.add(block_l)`).
    let mut l = v.as_mut_ptr();
    let mut block_l = BLOCK;
    let mut start_l = ptr::null_mut();
    let mut end_l = ptr::null_mut();
    let mut offsets_l = [MaybeUninit::<u8>::uninit(); BLOCK];

    // The current block on the right side (from `r.sub(block_r)` to `r`).
    // SAFETY: The documentation for .add() specifically mentions that `vec.as_ptr().add(vec.len())` is always safe
    let mut r = unsafe { l.add(v.len()) };
    let mut block_r = BLOCK;
    let mut start_r = ptr::null_mut();
    let mut end_r = ptr::null_mut();
    let mut offsets_r = [MaybeUninit::<u8>::uninit(); BLOCK];

    // FIXME: When we get VLAs, try creating one array of length `min(v.len(), 2 * BLOCK)` rather
    // than two fixed-size arrays of length `BLOCK`. VLAs might be more cache-efficient.

    // Returns the number of elements between pointers `l` (inclusive) and `r` (exclusive).
    fn width<T>(l: *mut T, r: *mut T) -> usize {
        assert!(mem::size_of::<T>() > 0);
        // FIXME: this should *likely* use `offset_from`, but more
        // investigation is needed (including running tests in miri).
        (r.addr() - l.addr()) / mem::size_of::<T>()
    }

    loop {
        // We are done with partitioning block-by-block when `l` and `r` get very close. Then we do
        // some patch-up work in order to partition the remaining elements in between.
        let is_done = width(l, r) <= 2 * BLOCK;

        if is_done {
            // Number of remaining elements (still not compared to the pivot).
            let mut rem = width(l, r);
            if start_l < end_l || start_r < end_r {
                rem -= BLOCK;
            }

            // Adjust block sizes so that the left and right block don't overlap, but get perfectly
            // aligned to cover the whole remaining gap.
            if start_l < end_l {
                block_r = rem;
            } else if start_r < end_r {
                block_l = rem;
            } else {
                // There were the same number of elements to switch on both blocks during the last
                // iteration, so there are no remaining elements on either block. Cover the remaining
                // items with roughly equally-sized blocks.
                block_l = rem / 2;
                block_r = rem - block_l;
            }
            debug_assert!(block_l <= BLOCK && block_r <= BLOCK);
            debug_assert!(width(l, r) == block_l + block_r);
        }

        if start_l == end_l {
            // Trace `block_l` elements from the left side.
            start_l = MaybeUninit::slice_as_mut_ptr(&mut offsets_l);
            end_l = start_l;
            let mut elem = l;

            for i in 0..block_l {
                // SAFETY: The unsafety operations below involve the usage of the `offset`.
                // According to the conditions required by the function, we satisfy them because:
                // 1. `offsets_l` is stack-allocated, and thus considered separate allocated object.
                // 2. The function `is_less` returns a `bool`.
                //    Casting a `bool` will never overflow `isize`.
                // 3. We have guaranteed that `block_l` will be `<= BLOCK`.
                //    Plus, `end_l` was initially set to the begin pointer of `offsets_` which was declared on the stack.
                //    Thus, we know that even in the worst case (all invocations of `is_less` return false) we will only be at most 1 byte past the end.
                // Another unsafety operation here is dereferencing `elem`.
                // However, `elem` was initially the begin pointer to the slice which is always valid.
                unsafe {
                    // Branchless comparison.
                    *end_l = i as u8;
                    end_l = end_l.add(!is_less(&*elem, pivot) as usize);
                    elem = elem.add(1);
                }
            }
        }

        if start_r == end_r {
            // Trace `block_r` elements from the right side.
            start_r = MaybeUninit::slice_as_mut_ptr(&mut offsets_r);
            end_r = start_r;
            let mut elem = r;

            for i in 0..block_r {
                // SAFETY: The unsafety operations below involve the usage of the `offset`.
                // According to the conditions required by the function, we satisfy them because:
                // 1. `offsets_r` is stack-allocated, and thus considered separate allocated object.
                // 2. The function `is_less` returns a `bool`.
                //    Casting a `bool` will never overflow `isize`.
                // 3. We have guaranteed that `block_r` will be `<= BLOCK`.
                //    Plus, `end_r` was initially set to the begin pointer of `offsets_` which was declared on the stack.
                //    Thus, we know that even in the worst case (all invocations of `is_less` return true) we will only be at most 1 byte past the end.
                // Another unsafety operation here is dereferencing `elem`.
                // However, `elem` was initially `1 * sizeof(T)` past the end and we decrement it by `1 * sizeof(T)` before accessing it.
                // Plus, `block_r` was asserted to be less than `BLOCK` and `elem` will therefore at most be pointing to the beginning of the slice.
                unsafe {
                    // Branchless comparison.
                    elem = elem.sub(1);
                    *end_r = i as u8;
                    end_r = end_r.add(is_less(&*elem, pivot) as usize);
                }
            }
        }

        // Number of out-of-order elements to swap between the left and right side.
        let count = cmp::min(width(start_l, end_l), width(start_r, end_r));

        if count > 0 {
            macro_rules! left {
                () => {
                    l.add(usize::from(*start_l))
                };
            }
            macro_rules! right {
                () => {
                    r.sub(usize::from(*start_r) + 1)
                };
            }

            // Instead of swapping one pair at the time, it is more efficient to perform a cyclic
            // permutation. This is not strictly equivalent to swapping, but produces a similar
            // result using fewer memory operations.

            // SAFETY: The use of `ptr::read` is valid because there is at least one element in
            // both `offsets_l` and `offsets_r`, so `left!` is a valid pointer to read from.
            //
            // The uses of `left!` involve calls to `offset` on `l`, which points to the
            // beginning of `v`. All the offsets pointed-to by `start_l` are at most `block_l`, so
            // these `offset` calls are safe as all reads are within the block. The same argument
            // applies for the uses of `right!`.
            //
            // The calls to `start_l.offset` are valid because there are at most `count-1` of them,
            // plus the final one at the end of the unsafe block, where `count` is the minimum number
            // of collected offsets in `offsets_l` and `offsets_r`, so there is no risk of there not
            // being enough elements. The same reasoning applies to the calls to `start_r.offset`.
            //
            // The calls to `copy_nonoverlapping` are safe because `left!` and `right!` are guaranteed
|
||||
// not to overlap, and are valid because of the reasoning above.
|
||||
unsafe {
|
||||
let tmp = ptr::read(left!());
|
||||
ptr::copy_nonoverlapping(right!(), left!(), 1);
|
||||
|
||||
for _ in 1..count {
|
||||
start_l = start_l.add(1);
|
||||
ptr::copy_nonoverlapping(left!(), right!(), 1);
|
||||
start_r = start_r.add(1);
|
||||
ptr::copy_nonoverlapping(right!(), left!(), 1);
|
||||
}
|
||||
|
||||
ptr::copy_nonoverlapping(&tmp, right!(), 1);
|
||||
mem::forget(tmp);
|
||||
start_l = start_l.add(1);
|
||||
start_r = start_r.add(1);
|
||||
}
|
||||
}
|
||||
|
||||
if start_l == end_l {
|
||||
// All out-of-order elements in the left block were moved. Move to the next block.
|
||||
|
||||
// block-width-guarantee
|
||||
// SAFETY: if `!is_done` then the slice width is guaranteed to be at least `2*BLOCK` wide. There
|
||||
// are at most `BLOCK` elements in `offsets_l` because of its size, so the `offset` operation is
|
||||
// safe. Otherwise, the debug assertions in the `is_done` case guarantee that
|
||||
// `width(l, r) == block_l + block_r`, namely, that the block sizes have been adjusted to account
|
||||
// for the smaller number of remaining elements.
|
||||
l = unsafe { l.add(block_l) };
|
||||
}
|
||||
|
||||
if start_r == end_r {
|
||||
// All out-of-order elements in the right block were moved. Move to the previous block.
|
||||
|
||||
// SAFETY: Same argument as [block-width-guarantee]. Either this is a full block `2*BLOCK`-wide,
|
||||
// or `block_r` has been adjusted for the last handful of elements.
|
||||
r = unsafe { r.sub(block_r) };
|
||||
}
|
||||
|
||||
if is_done {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// All that remains now is at most one block (either the left or the right) with out-of-order
|
||||
// elements that need to be moved. Such remaining elements can be simply shifted to the end
|
||||
// within their block.
|
||||
|
||||
if start_l < end_l {
|
||||
// The left block remains.
|
||||
// Move its remaining out-of-order elements to the far right.
|
||||
debug_assert_eq!(width(l, r), block_l);
|
||||
while start_l < end_l {
|
||||
// remaining-elements-safety
|
||||
// SAFETY: while the loop condition holds there are still elements in `offsets_l`, so it
|
||||
// is safe to point `end_l` to the previous element.
|
||||
//
|
||||
// The `ptr::swap` is safe if both its arguments are valid for reads and writes:
|
||||
// - Per the debug assert above, the distance between `l` and `r` is `block_l`
|
||||
// elements, so there can be at most `block_l` remaining offsets between `start_l`
|
||||
// and `end_l`. This means `r` will be moved at most `block_l` steps back, which
|
||||
// makes the `r.offset` calls valid (at that point `l == r`).
|
||||
// - `offsets_l` contains valid offsets into `v` collected during the partitioning of
|
||||
// the last block, so the `l.offset` calls are valid.
|
||||
unsafe {
|
||||
end_l = end_l.sub(1);
|
||||
ptr::swap(l.add(usize::from(*end_l)), r.sub(1));
|
||||
r = r.sub(1);
|
||||
}
|
||||
}
|
||||
width(v.as_mut_ptr(), r)
|
||||
} else if start_r < end_r {
|
||||
// The right block remains.
|
||||
// Move its remaining out-of-order elements to the far left.
|
||||
debug_assert_eq!(width(l, r), block_r);
|
||||
while start_r < end_r {
|
||||
// SAFETY: See the reasoning in [remaining-elements-safety].
|
||||
unsafe {
|
||||
end_r = end_r.sub(1);
|
||||
ptr::swap(l, r.sub(usize::from(*end_r) + 1));
|
||||
l = l.add(1);
|
||||
}
|
||||
}
|
||||
width(v.as_mut_ptr(), l)
|
||||
} else {
|
||||
// Nothing else to do, we're done.
|
||||
width(v.as_mut_ptr(), l)
|
||||
}
|
||||
}
|
||||
|
||||
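// Illustrative sketch (editor's addition, not part of this change): the cyclic
// permutation used above, reduced to safe code on plain indices. Instead of three
// `swap`s per three-element cycle (nine moves), one element is parked in a
// temporary and the rest are moved along the chain (n + 1 moves for a cycle of
// length n). `cycle_left` is a hypothetical helper, not an item in this file.
fn cycle_left<T: Copy>(xs: &mut [T], cycle: &[usize]) {
    if cycle.len() < 2 {
        return;
    }
    // Park the first element, shift the rest one step along the cycle.
    let tmp = xs[cycle[0]];
    for w in cycle.windows(2) {
        xs[w[0]] = xs[w[1]];
    }
    xs[*cycle.last().unwrap()] = tmp;
}
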
/// Partitions `v` into elements smaller than `v[pivot]`, followed by elements greater than or
/// equal to `v[pivot]`.
///
/// Returns a tuple of:
///
/// 1. Number of elements smaller than `v[pivot]`.
/// 2. True if `v` was already partitioned.
pub(super) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
let (mid, was_partitioned) = {
// Place the pivot at the beginning of the slice.
v.swap(0, pivot);
let (pivot, v) = v.split_at_mut(1);
let pivot = &mut pivot[0];

// Read the pivot into a stack-allocated variable for efficiency. If a following comparison
// operation panics, the pivot will be automatically written back into the slice.

// SAFETY: `pivot` is a reference to the first element of `v`, so `ptr::read` is safe.
let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot };
let pivot = &*tmp;

// Find the first pair of out-of-order elements.
let mut l = 0;
let mut r = v.len();

// SAFETY: The unsafety below involves indexing an array.
// For the first one: We already do the bounds checking here with `l < r`.
// For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation.
// From here we know that `r` must be at least `l`, which was shown to be valid in the first case.
unsafe {
// Find the first element greater than or equal to the pivot.
while l < r && is_less(v.get_unchecked(l), pivot) {
l += 1;
}

// Find the last element smaller than the pivot.
while l < r && !is_less(v.get_unchecked(r - 1), pivot) {
r -= 1;
}
}

(l + partition_in_blocks(&mut v[l..r], pivot, is_less), l >= r)

// `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated
// variable) back into the slice where it originally was. This step is critical in ensuring
// safety!
};

// Place the pivot between the two partitions.
v.swap(0, mid);

(mid, was_partitioned)
}

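// Illustrative sketch (editor's addition, not part of this change): the contract
// of `partition` above, checked with safe code. The helper name and the direct
// call to the module-private `partition` are assumptions for illustration only.
#[test]
fn partition_contract_sketch() {
    let mut v = vec![5, 1, 9, 3, 7];
    let mut is_less = |a: &i32, b: &i32| a < b;
    // Any in-bounds pivot index works for this check.
    let (mid, _was_partitioned) = partition(&mut v, 2, &mut is_less);
    let pivot = v[mid];
    // Everything left of `mid` is smaller than the pivot; everything from `mid`
    // onwards (including the pivot itself) is greater than or equal to it.
    assert!(v[..mid].iter().all(|x| *x < pivot));
    assert!(v[mid..].iter().all(|x| *x >= pivot));
}
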
/// Partitions `v` into elements equal to `v[pivot]` followed by elements greater than `v[pivot]`.
///
/// Returns the number of elements equal to the pivot. It is assumed that `v` does not contain
/// elements smaller than the pivot.
pub(super) fn partition_equal<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
// Place the pivot at the beginning of the slice.
v.swap(0, pivot);
let (pivot, v) = v.split_at_mut(1);
let pivot = &mut pivot[0];

// Read the pivot into a stack-allocated variable for efficiency. If a following comparison
// operation panics, the pivot will be automatically written back into the slice.
// SAFETY: The pointer here is valid because it is obtained from a reference to a slice.
let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot };
let pivot = &*tmp;

let len = v.len();
if len == 0 {
return 0;
}

// Now partition the slice.
let mut l = 0;
let mut r = len;
loop {
// SAFETY: The unsafety below involves indexing an array.
// For the first one: We already do the bounds checking here with `l < r`.
// For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation.
// From here we know that `r` must be at least `l`, which was shown to be valid in the first case.
unsafe {
// Find the first element greater than the pivot.
while l < r && !is_less(pivot, v.get_unchecked(l)) {
l += 1;
}

// Find the last element equal to the pivot.
loop {
r -= 1;
if l >= r || !is_less(pivot, v.get_unchecked(r)) {
break;
}
}

// Are we done?
if l >= r {
break;
}

// Swap the found pair of out-of-order elements.
let ptr = v.as_mut_ptr();
ptr::swap(ptr.add(l), ptr.add(r));
l += 1;
}
}

// We found `l` elements equal to the pivot. Add 1 to account for the pivot itself.
l + 1

// `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated variable)
// back into the slice where it originally was. This step is critical in ensuring safety!
}

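// Illustrative sketch (editor's addition, not part of this change): the contract
// of `partition_equal` above. The direct call to the module-private function is
// an assumption for illustration only.
#[test]
fn partition_equal_contract_sketch() {
    // The pivot value 1 is the minimum, so the precondition (no element smaller
    // than the pivot) holds.
    let mut v = vec![1, 3, 1, 2, 1];
    let mut is_less = |a: &i32, b: &i32| a < b;
    let count = partition_equal(&mut v, 0, &mut is_less);
    assert_eq!(count, 3); // three elements equal to the pivot
    assert!(v[..count].iter().all(|x| *x == 1));
    assert!(v[count..].iter().all(|x| *x > 1));
}
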
/// Scatters some elements around in an attempt to break patterns that might cause imbalanced
/// partitions in quicksort.
#[cold]
pub(super) fn break_patterns<T>(v: &mut [T]) {
let len = v.len();
if len >= 8 {
let mut seed = len;
let mut gen_usize = || {
// Pseudorandom number generator from the "Xorshift RNGs" paper by George Marsaglia.
if usize::BITS <= 32 {
let mut r = seed as u32;
r ^= r << 13;
r ^= r >> 17;
r ^= r << 5;
seed = r as usize;
seed
} else {
let mut r = seed as u64;
r ^= r << 13;
r ^= r >> 7;
r ^= r << 17;
seed = r as usize;
seed
}
};

// Take random numbers modulo this number.
// The number fits into `usize` because `len` is not greater than `isize::MAX`.
let modulus = len.next_power_of_two();

// Some pivot candidates will be near this index. Let's randomize them.
let pos = len / 4 * 2;

for i in 0..3 {
// Generate a random number modulo `len`. However, in order to avoid costly operations
// we first take it modulo a power of two, and then decrease by `len` until it fits
// into the range `[0, len - 1]`.
let mut other = gen_usize() & (modulus - 1);

// `other` is guaranteed to be less than `2 * len`.
if other >= len {
other -= len;
}

v.swap(pos - 1 + i, other);
}
}
}

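// Illustrative sketch (editor's addition, not part of this change): the 64-bit
// xorshift step used above, extracted as a standalone function. Each of the
// three shift-xor steps is invertible, so the whole step is a bijection on
// 64-bit states; since 0 maps to 0, a nonzero seed can never collapse to 0.
fn xorshift64(mut x: u64) -> u64 {
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    x
}

#[test]
fn xorshift64_preserves_nonzero_states() {
    let mut s = 0xDEADBEEFu64;
    for _ in 0..1_000 {
        s = xorshift64(s);
        assert_ne!(s, 0);
    }
}
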
/// Chooses a pivot in `v` and returns the index and `true` if the slice is likely already sorted.
///
/// Elements in `v` might be reordered in the process.
pub(super) fn choose_pivot<T, F>(v: &mut [T], is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
// Minimum length to choose the median-of-medians method.
// Shorter slices use the simple median-of-three method.
const SHORTEST_MEDIAN_OF_MEDIANS: usize = 50;
// Maximum number of swaps that can be performed in this function.
const MAX_SWAPS: usize = 4 * 3;

let len = v.len();

// Three indices near which we are going to choose a pivot.
let mut a = len / 4 * 1;
let mut b = len / 4 * 2;
let mut c = len / 4 * 3;

// Counts the total number of swaps we are about to perform while sorting indices.
let mut swaps = 0;

if len >= 8 {
// Swaps indices so that `v[a] <= v[b]`.
// SAFETY: `len >= 8` so there are at least two elements in the neighborhoods of
// `a`, `b` and `c`. This means the three calls to `sort_adjacent` result in
// corresponding calls to `sort3` with valid 3-item neighborhoods around each
// pointer, which in turn means the calls to `sort2` are done with valid
// references. Thus the `v.get_unchecked` calls are safe, as is the `ptr::swap`
// call.
let mut sort2 = |a: &mut usize, b: &mut usize| unsafe {
if is_less(v.get_unchecked(*b), v.get_unchecked(*a)) {
ptr::swap(a, b);
swaps += 1;
}
};

// Swaps indices so that `v[a] <= v[b] <= v[c]`.
let mut sort3 = |a: &mut usize, b: &mut usize, c: &mut usize| {
sort2(a, b);
sort2(b, c);
sort2(a, b);
};

if len >= SHORTEST_MEDIAN_OF_MEDIANS {
// Finds the median of `v[a - 1], v[a], v[a + 1]` and stores the index into `a`.
let mut sort_adjacent = |a: &mut usize| {
let tmp = *a;
sort3(&mut (tmp - 1), a, &mut (tmp + 1));
};

// Find medians in the neighborhoods of `a`, `b`, and `c`.
sort_adjacent(&mut a);
sort_adjacent(&mut b);
sort_adjacent(&mut c);
}

// Find the median among `a`, `b`, and `c`.
sort3(&mut a, &mut b, &mut c);
}

if swaps < MAX_SWAPS {
(b, swaps == 0)
} else {
// The maximum number of swaps was performed. Chances are the slice is descending or mostly
// descending, so reversing will probably help sort it faster.
v.reverse();
(len - 1 - b, true)
}
}

/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `pred`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &mut F, mut pred: Option<&'a T>, mut limit: u32)
where
F: FnMut(&T, &T) -> bool,
{
// Slices of up to this length get sorted using insertion sort.
const MAX_INSERTION: usize = 20;

// True if the last partitioning was reasonably balanced.
let mut was_balanced = true;
// True if the last partitioning didn't shuffle elements (the slice was already partitioned).
let mut was_partitioned = true;

loop {
let len = v.len();

// Very short slices get sorted using insertion sort.
if len <= MAX_INSERTION {
if len >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
}

// If too many bad pivot choices were made, simply fall back to heapsort in order to
// guarantee `O(n * log(n))` worst-case.
if limit == 0 {
heapsort(v, is_less);
return;
}

// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
// some elements around. Hopefully we'll choose a better pivot this time.
if !was_balanced {
break_patterns(v);
limit -= 1;
}

// Choose a pivot and try guessing whether the slice is already sorted.
let (pivot, likely_sorted) = choose_pivot(v, is_less);

// If the last partitioning was decently balanced and didn't shuffle elements, and if pivot
// selection predicts the slice is likely already sorted...
if was_balanced && was_partitioned && likely_sorted {
// Try identifying several out-of-order elements and shifting them to correct
// positions. If the slice ends up being completely sorted, we're done.
if partial_insertion_sort(v, is_less) {
return;
}
}

// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = pred {
if !is_less(p, &v[pivot]) {
let mid = partition_equal(v, pivot, is_less);

// Continue sorting elements greater than the pivot.
v = &mut v[mid..];
continue;
}
}

// Partition the slice.
let (mid, was_p) = partition(v, pivot, is_less);
was_balanced = cmp::min(mid, len - mid) >= len / 8;
was_partitioned = was_p;

// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
let (pivot, right) = right.split_at_mut(1);
let pivot = &pivot[0];

// Recurse into the shorter side only in order to minimize the total number of recursive
// calls and consume less stack space. Then just continue with the longer side (this is
// akin to tail recursion).
if left.len() < right.len() {
recurse(left, is_less, pred, limit);
v = right;
pred = Some(pivot);
} else {
recurse(right, is_less, Some(pivot), limit);
v = left;
}
}
}

/// Sorts `v` using pattern-defeating quicksort, which is *O*(*n* \* log(*n*)) worst-case.
pub fn quicksort<T, F>(v: &mut [T], mut is_less: F)
where
F: FnMut(&T, &T) -> bool,
{
// Sorting has no meaningful behavior on zero-sized types.
if T::IS_ZST {
return;
}

// Limit the number of imbalanced partitions to `floor(log2(len)) + 1`.
let limit = usize::BITS - v.len().leading_zeros();

recurse(v, &mut is_less, None, limit);
}

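// Illustrative sketch (editor's addition, not part of this change): the limit
// computation above in isolation. For `len > 0`, `usize::BITS - len.leading_zeros()`
// is the number of significant bits of `len`, i.e. `floor(log2(len)) + 1`.
fn imbalance_limit(len: usize) -> u32 {
    usize::BITS - len.leading_zeros()
}

#[test]
fn imbalance_limit_examples() {
    assert_eq!(imbalance_limit(1), 1); // floor(log2(1)) + 1
    assert_eq!(imbalance_limit(1000), 10); // 512 <= 1000 < 1024
    assert_eq!(imbalance_limit(1024), 11); // exactly 2^10
}
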
/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
/// stores the result into `v[..]`.
///
/// # Safety
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
let v = v.as_mut_ptr();

// SAFETY: mid and len must be in-bounds of v.
let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };

// The merge process first copies the shorter run into `buf`. Then it traces the newly copied
// run and the longer run forwards (or backwards), comparing their next unconsumed elements and
// copying the lesser (or greater) one into `v`.
//
// As soon as the shorter run is fully consumed, the process is done. If the longer run gets
// consumed first, then we must copy whatever is left of the shorter run into the remaining
// hole in `v`.
//
// Intermediate state of the process is always tracked by `hole`, which serves two purposes:
// 1. Protects integrity of `v` from panics in `is_less`.
// 2. Fills the remaining hole in `v` if the longer run gets consumed first.
//
// Panic safety:
//
// If `is_less` panics at any point during the process, `hole` will get dropped and fill the
// hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
// object it initially held exactly once.
let mut hole;

if mid <= len - mid {
// The left run is shorter.

// SAFETY: buf must have enough capacity for `v[..mid]`.
unsafe {
ptr::copy_nonoverlapping(v, buf, mid);
hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
}

// Initially, these pointers point to the beginnings of their arrays.
let left = &mut hole.start;
let mut right = v_mid;
let out = &mut hole.dest;

while *left < hole.end && right < v_end {
// Consume the lesser side.
// If equal, prefer the left run to maintain stability.

// SAFETY: `left` and `right` must be valid and part of `v`; the same holds for `out`.
unsafe {
let is_l = is_less(&*right, &**left);
let to_copy = if is_l { right } else { *left };
ptr::copy_nonoverlapping(to_copy, *out, 1);
*out = out.add(1);
right = right.add(is_l as usize);
*left = left.add(!is_l as usize);
}
}
} else {
// The right run is shorter.

// SAFETY: buf must have enough capacity for `v[mid..]`.
unsafe {
ptr::copy_nonoverlapping(v_mid, buf, len - mid);
hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
}

// Initially, these pointers point past the ends of their arrays.
let left = &mut hole.dest;
let right = &mut hole.end;
let mut out = v_end;

while v < *left && buf < *right {
// Consume the greater side.
// If equal, prefer the right run to maintain stability.

// SAFETY: `left` and `right` must be valid and part of `v`; the same holds for `out`.
unsafe {
let is_l = is_less(&*right.sub(1), &*left.sub(1));
*left = left.sub(is_l as usize);
*right = right.sub(!is_l as usize);
let to_copy = if is_l { *left } else { *right };
out = out.sub(1);
ptr::copy_nonoverlapping(to_copy, out, 1);
}
}
}
// Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
// it will now be copied into the hole in `v`.

// When dropped, copies the range `start..end` into `dest..`.
struct MergeHole<T> {
start: *mut T,
end: *mut T,
dest: *mut T,
}

impl<T> Drop for MergeHole<T> {
fn drop(&mut self) {
// SAFETY: `T` is not a zero-sized type, and these are pointers into a slice's elements.
unsafe {
let len = self.end.sub_ptr(self.start);
ptr::copy_nonoverlapping(self.start, self.dest, len);
}
}
}
}

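// Illustrative sketch (editor's addition, not part of this change): the
// drop-guard pattern behind `MergeHole`, reduced to its essence. If `is_less`
// panics mid-merge, unwinding runs the guard's `Drop` impl, which copies the
// unconsumed elements back out of the scratch buffer so the slice once again
// owns each of its elements exactly once.
struct CopyBackOnDrop<T> {
    src: *mut T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyBackOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: this sketch assumes `src` and `dst` point to `len` valid,
        // non-overlapping elements, exactly as `MergeHole` guarantees.
        unsafe { core::ptr::copy_nonoverlapping(self.src, self.dst, self.len) }
    }
}
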
/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in
/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However,
/// Python has since switched to a Powersort-based implementation.
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied:
///
/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
v: &mut [T],
is_less: &mut CmpF,
elem_alloc_fn: ElemAllocF,
elem_dealloc_fn: ElemDeallocF,
run_alloc_fn: RunAllocF,
run_dealloc_fn: RunDeallocF,
) where
CmpF: FnMut(&T, &T) -> bool,
ElemAllocF: Fn(usize) -> *mut T,
ElemDeallocF: Fn(*mut T, usize),
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
// Slices of up to this length get sorted using insertion sort.
const MAX_INSERTION: usize = 20;

// The caller should have already checked that.
debug_assert!(!T::IS_ZST);

let len = v.len();

// Short arrays get sorted in-place via insertion sort to avoid allocations.
if len <= MAX_INSERTION {
if len >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
}

// Allocate a buffer to use as scratch memory. We keep its length at 0 so that we can keep
// shallow copies of the contents of `v` in it, without risking the dtors running on those
// copies if `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
// shorter run, which will always have length at most `len / 2`.
let buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn);
let buf_ptr = buf.buf_ptr.as_ptr();

let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);

let mut end = 0;
let mut start = 0;

// Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
// code-gen is usually better. For the most sensitive types such as integers, these are merged
// bidirectionally at once. So there is no benefit in scanning backwards.
while end < len {
let (streak_end, was_reversed) = find_streak(&v[start..], is_less);
end += streak_end;
if was_reversed {
v[start..end].reverse();
}

// Insert some more elements into the run if it's too short. Insertion sort is faster than
// merge sort on short sequences, so this significantly improves performance.
end = provide_sorted_batch(v, start, end, is_less);

// Push this run onto the stack.
runs.push(TimSortRun { start, len: end - start });
start = end;

// Merge some pairs of adjacent runs to satisfy the invariants.
while let Some(r) = collapse(runs.as_slice(), len) {
let left = runs[r];
let right = runs[r + 1];
let merge_slice = &mut v[left.start..right.start + right.len];
// SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
// neither side may be of length 0.
unsafe {
merge(merge_slice, left.len, buf_ptr, is_less);
}
runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };
runs.remove(r);
}
}

// Finally, exactly one run must remain in the stack.
debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);

// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
// algorithm should continue building a new run instead, `None` is returned.
//
// TimSort is infamous for its buggy implementations, as described here:
// http://envisage-project.eu/timsort-specification-and-verification/
//
// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
// Enforcing them on just the top three is not sufficient to ensure that the invariants will
// still hold for *all* runs in the stack.
//
// This function correctly checks invariants for the top four runs. Additionally, if the top
// run starts at index 0, it will always demand a merge operation until the stack is fully
// collapsed, in order to complete the sort. (A worked example follows this function.)
#[inline]
fn collapse(runs: &[TimSortRun], stop: usize) -> Option<usize> {
let n = runs.len();
if n >= 2
&& (runs[n - 1].start + runs[n - 1].len == stop
|| runs[n - 2].len <= runs[n - 1].len
|| (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
|| (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
{
if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
} else {
None
}
}

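// Illustrative example (editor's addition, not part of this change): with run
// lengths [30, 20, 10] on the stack, invariant 2 is violated at the top
// (30 <= 20 + 10), so `collapse` demands a merge of the top two runs. The
// literal construction of `TimSortRun` assumes access to its crate-private
// fields and is for illustration only.
#[test]
fn collapse_requests_merge_on_invariant_violation() {
    let runs = [
        TimSortRun { start: 0, len: 30 },
        TimSortRun { start: 30, len: 20 },
        TimSortRun { start: 50, len: 10 },
    ];
    // `stop` is the total slice length; the top run does not reach it here,
    // so this is not a forced end-of-slice collapse.
    assert_eq!(collapse(&runs, 100), Some(1));
}
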
// Extremely basic versions of Vec.
// Their use is very limited, and keeping the code here allows reuse between the sort
// implementations.
struct BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
buf_ptr: ptr::NonNull<T>,
capacity: usize,
elem_dealloc_fn: ElemDeallocF,
}

impl<T, ElemDeallocF> BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
fn new<ElemAllocF>(
len: usize,
elem_alloc_fn: ElemAllocF,
elem_dealloc_fn: ElemDeallocF,
) -> Self
where
ElemAllocF: Fn(usize) -> *mut T,
{
Self {
buf_ptr: ptr::NonNull::new(elem_alloc_fn(len)).unwrap(),
capacity: len,
elem_dealloc_fn,
}
}
}

impl<T, ElemDeallocF> Drop for BufGuard<T, ElemDeallocF>
where
ElemDeallocF: Fn(*mut T, usize),
{
fn drop(&mut self) {
(self.elem_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity);
}
}

struct RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
buf_ptr: ptr::NonNull<TimSortRun>,
capacity: usize,
len: usize,
run_alloc_fn: RunAllocF,
run_dealloc_fn: RunDeallocF,
}

impl<RunAllocF, RunDeallocF> RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self {
// Most slices can be sorted with at most 16 runs in-flight.
const START_RUN_CAPACITY: usize = 16;

Self {
buf_ptr: ptr::NonNull::new(run_alloc_fn(START_RUN_CAPACITY)).unwrap(),
capacity: START_RUN_CAPACITY,
len: 0,
run_alloc_fn,
run_dealloc_fn,
}
}

fn push(&mut self, val: TimSortRun) {
if self.len == self.capacity {
let old_capacity = self.capacity;
let old_buf_ptr = self.buf_ptr.as_ptr();

self.capacity = self.capacity * 2;
self.buf_ptr = ptr::NonNull::new((self.run_alloc_fn)(self.capacity)).unwrap();

// SAFETY: the new and old buf_ptr were correctly allocated and old_buf_ptr has
// old_capacity valid elements.
unsafe {
ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr.as_ptr(), old_capacity);
}

(self.run_dealloc_fn)(old_buf_ptr, old_capacity);
}

// SAFETY: The invariant was just checked.
unsafe {
self.buf_ptr.as_ptr().add(self.len).write(val);
}
self.len += 1;
}

fn remove(&mut self, index: usize) {
if index >= self.len {
panic!("Index out of bounds");
}

// SAFETY: buf_ptr needs to be valid and len invariant upheld.
unsafe {
// The place we are taking from.
let ptr = self.buf_ptr.as_ptr().add(index);

// Shift everything down to fill in that spot.
ptr::copy(ptr.add(1), ptr, self.len - index - 1);
}
self.len -= 1;
}

fn as_slice(&self) -> &[TimSortRun] {
// SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld.
unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr.as_ptr(), self.len) }
}

fn len(&self) -> usize {
self.len
}
}

impl<RunAllocF, RunDeallocF> core::ops::Index<usize> for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
type Output = TimSortRun;

fn index(&self, index: usize) -> &Self::Output {
if index < self.len {
// SAFETY: buf_ptr and len invariant must be upheld.
unsafe {
return &*(self.buf_ptr.as_ptr().add(index));
}
}

panic!("Index out of bounds");
}
}

impl<RunAllocF, RunDeallocF> core::ops::IndexMut<usize> for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
if index < self.len {
// SAFETY: buf_ptr and len invariant must be upheld.
unsafe {
return &mut *(self.buf_ptr.as_ptr().add(index));
}
}

panic!("Index out of bounds");
}
}

impl<RunAllocF, RunDeallocF> Drop for RunVec<RunAllocF, RunDeallocF>
where
RunAllocF: Fn(usize) -> *mut TimSortRun,
RunDeallocF: Fn(*mut TimSortRun, usize),
{
fn drop(&mut self) {
// As long as TimSortRun is Copy we don't need to drop them individually but just the
// whole allocation.
(self.run_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity);
}
}
}

/// Internal type used by merge_sort.
#[derive(Clone, Copy, Debug)]
pub struct TimSortRun {
len: usize,
start: usize,
}

/// Takes a range denoted by `start` and `end` that is already sorted, and extends it to the right
/// if necessary with sorts optimized for smaller ranges such as insertion sort.
fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
assert!(end >= start && end <= len);

// This value is a balance between least comparisons and best performance, as
// influenced by for example cache locality.
const MIN_INSERTION_RUN: usize = 10;

// Insert some more elements into the run if it's too short. Insertion sort is faster than
// merge sort on short sequences, so this significantly improves performance.
let start_end_diff = end - start;

if start_end_diff < MIN_INSERTION_RUN && end < len {
// v[start_found..end] are elements that are already sorted in the input. We want to extend
// the sorted region to the left, so we push up to MIN_INSERTION_RUN - 1 elements to the
// right, which is more efficient than trying to push those already sorted elements to
// the left.
end = cmp::min(start + MIN_INSERTION_RUN, len);
let presorted_start = cmp::max(start_end_diff, 1);

insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);
}

end
}

/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the index
/// of the first element that is not part of said streak, and a bool denoting whether the streak
/// was reversed. Streaks can be increasing or decreasing.
fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

if len < 2 {
return (len, false);
}

let mut end = 2;

// SAFETY: See the specific comments below.
unsafe {
// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));

// SAFETY: We know end >= 2 and check end < len.
// From that follows that accessing v at end and end - 1 is safe.
if assume_reverse {
while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
end += 1;
}

(end, true)
} else {
while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
end += 1;
}
(end, false)
}
}
}

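// Illustrative example (editor's addition, not part of this change): what
// `find_streak` reports for a few inputs. The direct call to the module-private
// function is an assumption for illustration only.
#[test]
fn find_streak_examples() {
    let mut lt = |a: &i32, b: &i32| a < b;
    // Leading non-descending streak of length 4, not reversed.
    assert_eq!(find_streak(&[1, 2, 2, 3, 0], &mut lt), (4, false));
    // Leading strictly descending streak of length 3, reported as reversed.
    assert_eq!(find_streak(&[3, 2, 1, 9], &mut lt), (3, true));
    // Slices shorter than 2 are their own (trivial) streak.
    assert_eq!(find_streak(&[7], &mut lt), (1, false));
}
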
@@ -0,0 +1,8 @@
//! This module and the contained sub-modules contain the code for efficient and robust sort
//! implementations, as well as the domain-adjacent implementation of `select_nth_unstable`.

pub mod stable;
pub mod unstable;

pub(crate) mod select;
pub(crate) mod shared;

@@ -1,45 +1,78 @@
//! Slice selection
//!
//! This module contains the implementation for `slice::select_nth_unstable`.
//! It uses an introselect algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//! It uses an introselect algorithm based on ipnsort by Lukas Bergdoll and Orson Peters,
//! published at: <https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort>
//!
//! The fallback algorithm used for introselect is Median of Medians using Tukey's Ninther
//! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
//! better performance than one would get using heapsort as fallback.

use crate::cmp;
use crate::mem::{self, SizedTypeProperties};
use crate::slice::sort::{
break_patterns, choose_pivot, insertion_sort_shift_left, partition, partition_equal,
};

// For slices of up to this length it's probably faster to simply sort them.
// Defined at the module scope because it's used in multiple functions.
const MAX_INSERTION: usize = 10;
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;

/// Reorder the slice such that the element at `index` is at its final sorted position.
pub(crate) fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

// Puts a lower limit of 1 on `len`.
if index >= len {
panic!("partition_at_index index {} greater than length of slice {}", index, len);
}

if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == len - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}

let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}

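// Illustrative example (editor's addition, not part of this change): the
// observable contract of the function above, expressed against the public
// `select_nth_unstable` API that wraps it.
#[test]
fn select_nth_places_index_at_final_sorted_position() {
    let mut v = [9, 4, 7, 1, 3];
    let (lesser, median, greater) = v.select_nth_unstable(2);
    assert_eq!(*median, 4); // the value sorted order puts at index 2
    assert!(lesser.iter().all(|x| *x <= *median));
    assert!(greater.iter().all(|x| *x >= *median));
}
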
// For small sub-slices it's faster to use a dedicated small-sort, but because it is only called at
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
mut ancestor_pivot: Option<&'a T>,
is_less: &mut F,
mut pred: Option<&'a T>,
) where
F: FnMut(&T, &T) -> bool,
{
// Limit the amount of iterations and fall back to fast deterministic selection
// to ensure O(n) worst case running time. This limit needs to be constant, because
// using `ilog2(len)` like in `sort` would result in O(n log n) time complexity.
// The exact value of the limit is chosen somewhat arbitrarily, but for most inputs bad pivot
// selections should be relatively rare, so the limit usually shouldn't be reached
// anyways.
// Limit the amount of iterations and fall back to fast deterministic selection to ensure O(n)
// worst case running time. This limit needs to be constant, because using `ilog2(len)` like in
// `sort` would result in O(n log n) time complexity. The exact value of the limit is chosen
// somewhat arbitrarily, but for most inputs bad pivot selections should be relatively rare, so
// the limit is only reached for sub-slices of length `len / 2^limit` or less, which makes the
// remaining work with the fallback minimal in relative terms.
let mut limit = 16;

// True if the last partitioning was reasonably balanced.
let mut was_balanced = true;

loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@@ -50,38 +83,35 @@ fn partition_at_index_loop<'a, T, F>(
return;
}

// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
// some elements around. Hopefully we'll choose a better pivot this time.
if !was_balanced {
break_patterns(v);
limit -= 1;
}
limit -= 1;

// Choose a pivot
let (pivot, _) = choose_pivot(v, is_less);
let pivot_pos = choose_pivot(v, is_less);

// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = pred {
if !is_less(p, &v[pivot]) {
let mid = partition_equal(v, pivot, is_less);
if let Some(p) = ancestor_pivot {
if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));

// Continue sorting elements greater than the pivot. We know that `mid` contains
// the pivot. So we can continue after `mid`.
let mid = num_lt + 1;

// If we've passed our index, then we're good.
if mid > index {
return;
}

// Otherwise, continue sorting elements greater than the pivot.
v = &mut v[mid..];
index = index - mid;
pred = None;
ancestor_pivot = None;
continue;
}
}

let (mid, _) = partition(v, pivot, is_less);
was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8;
let mid = partition(v, pivot_pos, is_less);

// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
@@ -91,7 +121,7 @@ fn partition_at_index_loop<'a, T, F>(
if mid < index {
v = right;
index = index - mid - 1;
pred = Some(pivot);
ancestor_pivot = Some(pivot);
} else if mid > index {
v = left;
} else {
@@ -122,41 +152,6 @@ fn max_index<T, F: FnMut(&T, &T) -> bool>(slice: &[T], is_less: &mut F) -> Optio
.map(|(i, _)| i)
}

/// Reorder the slice such that the element at `index` is at its final sorted position.
pub fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
if index >= v.len() {
panic!("partition_at_index index {} greater than length of slice {}", index, v.len());
}

if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == v.len() - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, &mut is_less, None);
}

let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}

/// Selection algorithm to select the k-th element from the slice in guaranteed O(n) time.
/// This is essentially a quickselect that uses Tukey's Ninther for pivot selection
fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) {
@@ -168,8 +163,8 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut

// We now know that `k < v.len() <= isize::MAX`
loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@@ -232,7 +227,8 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
}

median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
partition(v, lo + pivot, is_less).0

partition(v, lo + pivot, is_less)
}

/// Moves around the 9 elements at the indices a..i, such that

@@ -0,0 +1,45 @@
use crate::marker::Freeze;

pub(crate) mod pivot;
pub(crate) mod smallsort;

/// SAFETY: this is safety relevant, how does this interact with the soundness holes in
/// specialization?
#[rustc_unsafe_specialization_marker]
pub(crate) trait FreezeMarker {}

impl<T: Freeze> FreezeMarker for T {}

/// Finds a run of sorted elements starting at the beginning of the slice.
///
/// Returns the length of the run, and a bool that is false when the run
/// is ascending, and true if the run is strictly descending.
#[inline(always)]
pub(crate) fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(
v: &[T],
is_less: &mut F,
) -> (usize, bool) {
let len = v.len();
if len < 2 {
return (len, false);
}

// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
// This also means that run_len < len implies run_len and run_len - 1
// are valid indices as well.
unsafe {
let mut run_len = 2;
let strictly_descending = is_less(v.get_unchecked(1), v.get_unchecked(0));
if strictly_descending {
while run_len < len && is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) {
run_len += 1;
}
} else {
while run_len < len && !is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1))
{
run_len += 1;
}
}
(run_len, strictly_descending)
}
}

@@ -0,0 +1,88 @@
//! This module contains the logic for pivot selection.

use crate::intrinsics;

// Recursively select a pseudomedian if above this threshold.
const PSEUDO_MEDIAN_REC_THRESHOLD: usize = 64;

/// Selects a pivot from `v`. Algorithm taken from glidesort by Orson Peters.
///
/// This chooses a pivot by sampling an adaptive number of points, approximating
/// the quality of a median of sqrt(n) elements.
pub fn choose_pivot<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> usize {
// We use unsafe code and raw pointers here because we're dealing with
// heavy recursion. Passing safe slices around would involve a lot of
// branches and function call overhead.

let len = v.len();
if len < 8 {
intrinsics::abort();
}

// SAFETY: a, b, c point to initialized regions of len_div_8 elements,
// satisfying median3 and median3_rec's preconditions as v_base points
// to an initialized region of n = len elements.
unsafe {
let v_base = v.as_ptr();
let len_div_8 = len / 8;

let a = v_base; // [0, floor(n/8))
let b = v_base.add(len_div_8 * 4); // [4*floor(n/8), 5*floor(n/8))
let c = v_base.add(len_div_8 * 7); // [7*floor(n/8), 8*floor(n/8))

if len < PSEUDO_MEDIAN_REC_THRESHOLD {
median3(&*a, &*b, &*c, is_less).sub_ptr(v_base)
} else {
median3_rec(a, b, c, len_div_8, is_less).sub_ptr(v_base)
}
}
}

/// Calculates an approximate median of 3 elements from sections a, b, c, or
/// recursively from an approximation of each, if they're large enough. By
/// dividing the size of each section by 8 when recursing we have logarithmic
/// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
/// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
///
/// SAFETY: a, b, c must point to the start of initialized regions of memory of
/// at least n elements.
unsafe fn median3_rec<T, F: FnMut(&T, &T) -> bool>(
mut a: *const T,
mut b: *const T,
mut c: *const T,
n: usize,
is_less: &mut F,
) -> *const T {
// SAFETY: a, b, c still point to initialized regions of n / 8 elements,
// by the exact same logic as in choose_pivot.
unsafe {
if n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD {
let n8 = n / 8;
a = median3_rec(a, a.add(n8 * 4), a.add(n8 * 7), n8, is_less);
b = median3_rec(b, b.add(n8 * 4), b.add(n8 * 7), n8, is_less);
c = median3_rec(c, c.add(n8 * 4), c.add(n8 * 7), n8, is_less);
}
median3(&*a, &*b, &*c, is_less)
}
}

/// Calculates the median of 3 elements.
///
/// SAFETY: a, b, c must be valid initialized elements.
#[inline(always)]
fn median3<T, F: FnMut(&T, &T) -> bool>(a: &T, b: &T, c: &T, is_less: &mut F) -> *const T {
// Compiler tends to make this branchless when sensible, and avoids the
// third comparison when not.
let x = is_less(a, b);
let y = is_less(a, c);
if x == y {
// If x=y=0 then b, c <= a. In this case we want to return max(b, c).
// If x=y=1 then a < b, c. In this case we want to return min(b, c).
// By toggling the outcome of b < c using XOR x we get this behavior.
let z = is_less(b, c);
if z ^ x { c } else { b }
} else {
// Either c <= a < b or b <= a < c, thus a is our median.
a
}
}

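// Illustrative example (editor's addition, not part of this change): the
// comparison scheme above on concrete values. Calling the module-private
// `median3` and dereferencing its raw-pointer result are assumptions for
// illustration only.
#[test]
fn median3_examples() {
    let mut lt = |a: &i32, b: &i32| a < b;
    // x = (2 < 1) = false, y = (2 < 3) = true: x != y, so `a` is the median
    // and the third comparison is skipped.
    assert_eq!(unsafe { *median3(&2, &1, &3, &mut lt) }, 2);
    // x = y = true means a < b and a < c, so the median is min(b, c).
    assert_eq!(unsafe { *median3(&1, &3, &2, &mut lt) }, 2);
}
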
@@ -0,0 +1,843 @@
//! This module contains a variety of sort implementations that are optimized for small lengths.

use crate::intrinsics;
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice;

use crate::slice::sort::shared::FreezeMarker;

// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}

impl<T> StableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Optimal number of comparisons, and good perf.
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        _scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        if v.len() >= 2 {
            insertion_sort_shift_left(v, 1, is_less);
        }
    }
}

impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        SMALL_SORT_GENERAL_THRESHOLD
    }

    #[inline(always)]
    fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        small_sort_general_with_scratch(v, scratch, is_less);
    }
}

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        small_sort_fallback(v, is_less);
    }
}

impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        match const { choose_unstable_small_sort::<T>() } {
            UnstableSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD,
            UnstableSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD,
            UnstableSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD,
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // This construct is used to limit the LLVM IR generated, which saves large amounts of
        // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
        (const { inst_unstable_small_sort::<T, F>() })(v, is_less);
    }
}

/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`]!
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`]!
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// SAFETY: If you change this value, you have to adjust [`small_sort_network`]!
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;

enum UnstableSmallSort {
    Fallback,
    General,
    Network,
}

const fn choose_unstable_small_sort<T: FreezeMarker>() -> UnstableSmallSort {
    if T::is_copy()
        && has_efficient_in_place_swap::<T>()
        && (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
    {
        // Heuristic for int like types.
        return UnstableSmallSort::Network;
    }

    if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
        return UnstableSmallSort::General;
    }

    UnstableSmallSort::Fallback
}

const fn inst_unstable_small_sort<T: FreezeMarker, F: FnMut(&T, &T) -> bool>()
-> fn(&mut [T], &mut F) {
    match const { choose_unstable_small_sort::<T>() } {
        UnstableSmallSort::Fallback => small_sort_fallback::<T, F>,
        UnstableSmallSort::General => small_sort_general::<T, F>,
        UnstableSmallSort::Network => small_sort_network::<T, F>,
    }
}
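
// An illustrative check (a sketch, not exhaustive) of how the const dispatch
// above plays out: `i32` is `Copy`, swaps efficiently in-place and fits the
// stack limit, so it gets the network small-sort; `String` is `Freeze` but not
// `Copy`, so it gets the general small-sort; a large `Copy` type like
// `[u8; 128]` exceeds `MAX_STACK_ARRAY_SIZE` and falls back.
#[test]
fn unstable_small_sort_dispatch() {
    assert!(
        <i32 as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_NETWORK_THRESHOLD
    );
    assert!(
        <String as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_GENERAL_THRESHOLD
    );
    assert!(
        <[u8; 128] as UnstableSmallSortTypeImpl>::small_sort_threshold()
            == SMALL_SORT_FALLBACK_THRESHOLD
    );
}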

fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    if v.len() >= 2 {
        insertion_sort_shift_left(v, 1, is_less);
    }
}

fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The slice is backed by `stack_array`, which provides storage and
    // alignment for exactly `SMALL_SORT_GENERAL_SCRATCH_LEN` elements of `T`.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}

fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        let presorted_len = if const { mem::size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            &*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}

struct CopyOnDrop<T> {
    src: *const T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}

fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling, we *really* don't want that to happen here for binary-size
    // reasons.
    loop {
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        region =
            unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should is_less panic, v was not modified in parity_merge and retains its original input.
    // scratch and v must not alias and scratch has v.len() space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(
            &mut *ptr::slice_from_raw_parts_mut(v_base, len),
            scratch_base,
            is_less,
        );
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}

/// Swaps the values in the slice pointed to by `v_base` at positions `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield
    // valid pointers into `v_base` that are properly aligned and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should
        // still be in a well defined state, without duplicates.

        // Important to only swap if it is more and not if it is equal. is_less should return
        // false for equal, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of swap if.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b);
        // }

        // The goal is to generate cmov instructions here.
        let left_swap = if should_swap { v_b } else { v_a };
        let right_swap = if should_swap { v_a } else { v_b };

        let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
        ptr::copy(left_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
    }
}

// Never inline this function to avoid code bloat. It still optimizes nicely and has practically
// no performance impact.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network, see:
    // https://bertdobbelaere.github.io/sorting_networks.html

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
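
// A small illustrative smoke test (one permutation, not a proof of network
// correctness): the 9-element network must sort a fully reversed input.
#[test]
fn sort9_optimal_sorts_reversed() {
    let mut v = [9, 8, 7, 6, 5, 4, 3, 2, 1];
    sort9_optimal(&mut v, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5, 6, 7, 8, 9]);
}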

// Never inline this function to avoid code bloat. It still optimizes nicely and has practically
// no performance impact.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network, see:
    // https://bertdobbelaere.github.io/sorting_networks.html

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
    }
}

/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers, as when we use a
        // for loop LLVM likes to unroll this loop which we do not want.
        //
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
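
// An illustrative example of the shift-left contract: `v[..offset]` must
// already be sorted, the remaining elements are inserted one by one.
#[test]
fn insertion_sort_shift_left_basic() {
    let mut v = [1, 3, 5, 2, 4];
    insertion_sort_shift_left(&mut v, 3, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5]);
}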

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further this only does 5 instead of 6
    // comparisons compared to a stable transposition 4 element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d      b             c
        //  0,  1 |  a   b      c             d
        //  1,  0 |  c   d      a             b
        //  1,  1 |  c   b      a             d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = select(c3, c, a);
        let max = select(c4, b, d);
        let unknown_left = select(c3, a, select(c4, c, b));
        let unknown_right = select(c4, d, select(c3, b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = select(c5, unknown_right, unknown_left);
        let hi = select(c5, unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }

    #[inline(always)]
    fn select<T>(cond: bool, if_true: *const T, if_false: *const T) -> *const T {
        if cond { if_true } else { if_false }
    }
}
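
// An illustrative stability check: sorting pairs by their first field only
// must preserve the input order of the second field among equal keys.
#[test]
fn sort4_stable_preserves_equal_order() {
    let v = [(1, 'a'), (0, 'b'), (1, 'c'), (0, 'd')];
    let mut dst = MaybeUninit::<[(u32, char); 4]>::uninit();
    // SAFETY: `v` is valid for 4 reads and `dst` for 4 writes, and
    // `sort4_stable` initializes all of `dst[0..4]`.
    let out = unsafe {
        sort4_stable(v.as_ptr(), dst.as_mut_ptr() as *mut (u32, char), &mut |a, b| a.0 < b.0);
        dst.assume_init()
    };
    assert!(out == [(0, 'b'), (0, 'd'), (1, 'a'), (1, 'c')]);
}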

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
    }
}

#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}

#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}

/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze; the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^            ^       ^
    //              |left_rev           |       |right_rev
    //                                  |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented,
    // at the end we will have left == left_rev + 1, and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if len % 2 != 0 {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We now should have consumed the full input exactly once. This can
        // only fail if the comparison operator fails to be Ord, in which case
        // we will panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}
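
// An illustrative check: merging the two sorted halves v[..len / 2] and
// v[len / 2..] into a non-aliasing destination buffer.
#[test]
fn bidirectional_merge_basic() {
    let v = [1, 3, 5, 7, 2, 4, 6, 8];
    let mut dst = MaybeUninit::<[i32; 8]>::uninit();
    // SAFETY: v.len() >= 2, `dst` is valid for 8 writes and does not alias `v`,
    // and `bidirectional_merge` initializes all of `dst[0..8]`.
    let merged = unsafe {
        bidirectional_merge(&v, dst.as_mut_ptr() as *mut i32, &mut |a, b| a < b);
        dst.assume_init()
    };
    assert!(merged == [1, 2, 3, 4, 5, 6, 7, 8]);
}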

#[inline(never)]
fn panic_on_ord_violation() -> ! {
    panic!("Ord violation");
}

#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // Heuristic that holds true on all tested 64-bit capable architectures.
    mem::size_of::<T>() <= 8 // mem::size_of::<u64>()
}

#[test]
fn type_info() {
    assert!(has_efficient_in_place_swap::<i32>());
    assert!(has_efficient_in_place_swap::<u64>());
    assert!(!has_efficient_in_place_swap::<u128>());
    assert!(!has_efficient_in_place_swap::<String>());
}

/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

impl<T: Copy> CopyMarker for T {}

#[const_trait]
trait IsCopy {
    fn is_copy() -> bool;
}

impl<T> const IsCopy for T {
    default fn is_copy() -> bool {
        false
    }
}

impl<T: CopyMarker> const IsCopy for T {
    fn is_copy() -> bool {
        true
    }
}
@@ -0,0 +1,300 @@
//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::cmp;
use crate::intrinsics;
use crate::mem::MaybeUninit;

use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least `max(v.len() / 2,
/// MIN_SMALL_SORT_SCRATCH_LEN)` otherwise the implementation may abort.
/// Fully ascending and descending inputs will be sorted with exactly N - 1
/// comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input length `MIN_SQRT_RUN_LEN` would break pattern
        // detection of full or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        // 1. The first stack_len elements of runs/desired_depths are initialized.
        // 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        // 3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //    scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than the upcoming desired depth, pop
                // left neighbor run from stack and merge into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}
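
// An illustrative usage sketch: `scratch` must hold at least
// max(v.len() / 2, the small-sort scratch minimum) elements; the literal 48
// here mirrors SMALL_SORT_GENERAL_SCRATCH_LEN as defined in smallsort.
#[test]
fn drift_sort_smoke() {
    let mut v = [3, 1, 2, 5, 4, 0];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 48];
    sort(&mut v, &mut scratch, false, &mut |a, b| a < b);
    assert!(v == [0, 1, 2, 3, 4, 5]);
}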

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//     x < (2^62 / n + 1) * 2n
//     x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
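
// An illustrative worked example of the heuristic above: for n = 8 with four
// runs [0,2), [2,4), [4,6), [6,8), the middle boundary corresponds to the
// simplest dyadic fraction (1/2) and must be the shallowest merge node.
#[test]
fn merge_tree_depth_prefers_middle() {
    let f = merge_tree_scale_factor(8);
    let d_left = merge_tree_depth(0, 2, 4, f);
    let d_mid = merge_tree_depth(2, 4, 6, f);
    let d_right = merge_tree_depth(4, 6, 8, f);
    assert!(d_mid < d_left && d_mid < d_right);
}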

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}
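
// An illustrative check of the approximation quality: exact for powers of
// four, and within a few percent elsewhere (the true sqrt(10_000) is 100;
// the approximation yields (128 + 78) / 2 = 103).
#[test]
fn sqrt_approx_is_close() {
    assert!(sqrt_approx(64) == 8);
    assert!(sqrt_approx(10_000) == 103);
}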

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined into
        // the quicksort itself, making the recursive base case faster and generally
        // more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a usize because the maximum slice length is isize::MAX.
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
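
// An illustrative round-trip check of the bit-packing above: the sorted flag
// lives in the lowest bit, the length in the remaining bits.
#[test]
fn driftsort_run_roundtrip() {
    let sorted = DriftsortRun::new_sorted(1234);
    assert!(sorted.sorted() && sorted.len() == 1234);
    let unsorted = DriftsortRun::new_unsorted(1234);
    assert!(!unsorted.sorted() && unsorted.len() == 1234);
}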

@@ -0,0 +1,151 @@
//! This module contains logic for performing a merge of two sorted sub-slices.

use crate::cmp;
use crate::mem::MaybeUninit;
use crate::ptr;

/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `scratch` as
/// temporary storage, and stores the result into `v[..]`.
pub fn merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    mid: usize,
    is_less: &mut F,
) {
    let len = v.len();

    if mid == 0 || mid >= len || scratch.len() < cmp::min(mid, len - mid) {
        return;
    }

    // SAFETY: We checked that the two slices are non-empty and `mid` is in-bounds.
    // We checked that the buffer `scratch` has enough capacity to hold a copy of
    // the shorter slice. `merge_up` and `merge_down` are written in such a way that
    // they uphold the contract described in `MergeState::drop`.
    unsafe {
        // The merge process first copies the shorter run into `buf`. Then it traces
        // the newly copied run and the longer run forwards (or backwards), comparing
        // their next unconsumed elements and copying the lesser (or greater) one into `v`.
        //
        // As soon as the shorter run is fully consumed, the process is done. If the
        // longer run gets consumed first, then we must copy whatever is left of the
        // shorter run into the remaining gap in `v`.
        //
        // Intermediate state of the process is always tracked by `gap`, which serves
        // two purposes:
        // 1. Protects integrity of `v` from panics in `is_less`.
        // 2. Fills the remaining gap in `v` if the longer run gets consumed first.

        let buf = MaybeUninit::slice_as_mut_ptr(scratch);

        let v_base = v.as_mut_ptr();
        let v_mid = v_base.add(mid);
        let v_end = v_base.add(len);

        let left_len = mid;
        let right_len = len - mid;

        let left_is_shorter = left_len <= right_len;
        let save_base = if left_is_shorter { v_base } else { v_mid };
        let save_len = if left_is_shorter { left_len } else { right_len };

        ptr::copy_nonoverlapping(save_base, buf, save_len);

        let mut merge_state = MergeState { start: buf, end: buf.add(save_len), dst: save_base };

        if left_is_shorter {
            merge_state.merge_up(v_mid, v_end, is_less);
        } else {
            merge_state.merge_down(v_base, buf, v_end, is_less);
        }
        // Finally, `merge_state` gets dropped. If the shorter run was not fully
        // consumed, whatever remains of it will now be copied into the hole in `v`.
    }

    // When dropped, copies the range `start..end` into `dst..`.
    struct MergeState<T> {
        start: *mut T,
        end: *mut T,
        dst: *mut T,
    }

    impl<T> MergeState<T> {
        /// # Safety
        /// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is
        /// the longer sub-slice and so that `dst` can be written to at least the shorter
        /// sub-slice length times. In addition `start -> end` and `right -> right_end` MUST be
        /// valid to be read. This function MUST only be called once.
        unsafe fn merge_up<F: FnMut(&T, &T) -> bool>(
            &mut self,
            mut right: *const T,
            right_end: *const T,
            is_less: &mut F,
        ) {
            // SAFETY: See function safety comment.
            unsafe {
                let left = &mut self.start;
                let out = &mut self.dst;

                while *left != self.end && right as *const T != right_end {
                    let consume_left = !is_less(&*right, &**left);

                    let src = if consume_left { *left } else { right };
                    ptr::copy_nonoverlapping(src, *out, 1);

                    *left = left.add(consume_left as usize);
                    right = right.add(!consume_left as usize);

                    *out = out.add(1);
                }
            }
        }

        /// # Safety
        /// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst`
        /// is the shorter sub-slice and so that `out` can be written to at least the shorter
        /// sub-slice length times. In addition `left_end <- dst` and `right_end <- end` MUST be
        /// valid to be read. This function MUST only be called once.
        unsafe fn merge_down<F: FnMut(&T, &T) -> bool>(
            &mut self,
            left_end: *const T,
            right_end: *const T,
            mut out: *mut T,
            is_less: &mut F,
        ) {
            // SAFETY: See function safety comment.
            unsafe {
                loop {
                    let left = self.dst.sub(1);
                    let right = self.end.sub(1);
                    out = out.sub(1);

                    let consume_left = is_less(&*right, &*left);

                    let src = if consume_left { left } else { right };
                    ptr::copy_nonoverlapping(src, out, 1);

                    self.dst = left.add(!consume_left as usize);
                    self.end = right.add(consume_left as usize);

                    if self.dst as *const T == left_end || self.end as *const T == right_end {
                        break;
                    }
                }
            }
        }
    }

    impl<T> Drop for MergeState<T> {
        fn drop(&mut self) {
            // SAFETY: The user of MergeState MUST ensure, that at any point this drop
            // impl MAY run, for example when the user provided `is_less` panics, that
            // copying the contiguous region between `start` and `end` to `dst` will
            // leave the input slice `v` with each original element and all possible
            // modifications observed.
            unsafe {
                let len = self.end.sub_ptr(self.start);
                ptr::copy_nonoverlapping(self.start, self.dst, len);
            }
        }
    }
}
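
// An illustrative check: merging the sorted runs v[..3] and v[3..], with
// scratch sized for the shorter run (min(mid, len - mid) elements suffice).
#[test]
fn merge_basic() {
    let mut v = [2, 4, 6, 1, 3, 5, 7];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 3];
    merge(&mut v, &mut scratch, 3, &mut |a, b| a < b);
    assert!(v == [1, 2, 3, 4, 5, 6, 7]);
}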

@@ -0,0 +1,116 @@
//! This module contains the entry points for `slice::sort`.

use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};

use crate::slice::sort::shared::smallsort::{
    insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};

pub(crate) mod drift;
pub(crate) mod merge;
pub(crate) mod quicksort;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }

    driftsort_main::<T, F, BufT>(v, is_less);
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n / 2 for large inputs. We do this by scaling
    // our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
    let len = v.len();
    let alloc_len =
        cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
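
    // Worked example of this formula, assuming T = u64 (8 bytes), so
    // max_full_alloc = 1_000_000 elements:
    //   len = 10_000_000: max(5_000_000, min(10_000_000, 1_000_000)) = 5_000_000 (n / 2)
    //   len =    500_000: max(250_000,   min(500_000,    1_000_000)) =   500_000 (n)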

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts an owned memory buffer, so that sort code can live in core where no allocation is
/// possible. This trait can then be implemented in a place that has access to allocation.
pub trait BufGuard<T> {
    /// Creates a new buffer that holds at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
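
// A minimal sketch of how an allocating crate might implement `BufGuard`,
// assuming `Vec` is available (illustrative only, not necessarily the
// implementation the standard library actually uses):
//
//     struct VecBuf<T>(Vec<MaybeUninit<T>>);
//
//     impl<T> BufGuard<T> for VecBuf<T> {
//         fn with_capacity(capacity: usize) -> Self {
//             let mut buf = Vec::with_capacity(capacity);
//             // SAFETY: `MaybeUninit<T>` is allowed to be uninitialized.
//             unsafe { buf.set_len(capacity) };
//             VecBuf(buf)
//         }
//
//         fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
//             &mut self.0
//         }
//     }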

#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: MaybeUninit::uninit_array() }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        let len = N / mem::size_of::<T>();

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}
@@ -0,0 +1,267 @@
|
||||
//! This module contains a stable quicksort and partition implementation.
|
||||
|
||||
use crate::intrinsics;
|
||||
use crate::mem::{self, ManuallyDrop, MaybeUninit};
|
||||
use crate::ptr;
|
||||
|
||||
use crate::slice::sort::shared::pivot::choose_pivot;
|
||||
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
|
||||
use crate::slice::sort::shared::FreezeMarker;
|
||||
|
||||
/// Sorts `v` recursively using quicksort.
|
||||
///
|
||||
/// `limit` when initialized with `c*log(v.len())` for some c ensures we do not
|
||||
/// overflow the stack or go quadratic.
|
||||
#[inline(never)]
|
||||
pub fn quicksort<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
mut limit: u32,
|
||||
mut left_ancestor_pivot: Option<&T>,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
loop {
|
||||
let len = v.len();
|
||||
|
||||
if len <= T::small_sort_threshold() {
|
||||
T::small_sort(v, scratch, is_less);
|
||||
return;
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
// We have had too many bad pivots, switch to O(n log n) fallback
|
||||
// algorithm. In our case that is driftsort in eager mode.
|
||||
crate::slice::sort::stable::drift::sort(v, scratch, true, is_less);
|
||||
return;
|
||||
}
|
||||
limit -= 1;
|
||||
|
||||
let pivot_pos = choose_pivot(v, is_less);
|
||||
// SAFETY: choose_pivot promises to return a valid pivot index.
|
||||
unsafe {
|
||||
intrinsics::assume(pivot_pos < v.len());
|
||||
}
|
||||
|
||||
// SAFETY: We only access the temporary copy for Freeze types, otherwise
|
||||
// self-modifications via `is_less` would not be observed and this would
|
||||
// be unsound. Our temporary copy does not escape this scope.
|
||||
let pivot_copy = unsafe { ManuallyDrop::new(ptr::read(&v[pivot_pos])) };
|
||||
let pivot_ref = (!has_direct_interior_mutability::<T>()).then_some(&*pivot_copy);
|
||||
|
||||
// We choose a pivot, and check if this pivot is equal to our left
|
||||
// ancestor. If true, we do a partition putting equal elements on the
|
||||
// left and do not recurse on it. This gives O(n log k) sorting for k
|
||||
// distinct values, a strategy borrowed from pdqsort. For types with
|
||||
// interior mutability we can't soundly create a temporary copy of the
|
||||
// ancestor pivot, and use left_partition_len == 0 as our method for
|
||||
// detecting when we re-use a pivot, which means we do at most three
|
||||
// partition operations with pivot p instead of the optimal two.
|
||||
let mut perform_equal_partition = false;
|
||||
if let Some(la_pivot) = left_ancestor_pivot {
|
||||
perform_equal_partition = !is_less(la_pivot, &v[pivot_pos]);
|
||||
}
|
||||
|
||||
let mut left_partition_len = 0;
|
||||
if !perform_equal_partition {
|
||||
left_partition_len = stable_partition(v, scratch, pivot_pos, false, is_less);
|
||||
perform_equal_partition = left_partition_len == 0;
|
||||
}
|
||||
|
||||
if perform_equal_partition {
|
||||
let mid_eq = stable_partition(v, scratch, pivot_pos, true, &mut |a, b| !is_less(b, a));
|
||||
v = &mut v[mid_eq..];
|
||||
left_ancestor_pivot = None;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process left side with the next loop iter, right side with recursion.
|
||||
let (left, right) = v.split_at_mut(left_partition_len);
|
||||
quicksort(right, scratch, limit, pivot_ref, is_less);
|
||||
v = left;
|
||||
}
|
||||
}
|
||||
|
||||
/// Partitions `v` using pivot `p = v[pivot_pos]` and returns the number of
/// elements less than `p`. The relative order of elements that compare < p and
/// those that compare >= p is preserved - it is a stable partition.
///
/// If `is_less` is not a strict total order or panics, or if `scratch.len() < v.len()`
/// or `pivot_pos >= v.len()`, the result and the state of `v` are sound but unspecified.
fn stable_partition<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    pivot_pos: usize,
    pivot_goes_left: bool,
    is_less: &mut F,
) -> usize {
    let len = v.len();

    if intrinsics::unlikely(scratch.len() < len || pivot_pos >= len) {
        core::intrinsics::abort()
    }

    let v_base = v.as_ptr();
    let scratch_base = MaybeUninit::slice_as_mut_ptr(scratch);

    // The core idea is to write the values that compare as less-than to the left
    // side of `scratch`, while the values that compared as greater or equal than
    // `v[pivot_pos]` go to the right side of `scratch` in reverse. See
    // PartitionState for details.

    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: we made sure the scratch has length >= len and that pivot_pos
        // is in-bounds. v and scratch are disjoint slices.
        let pivot = v_base.add(pivot_pos);
        let mut state = PartitionState::new(v_base, scratch_base, len);

        let mut pivot_in_scratch = ptr::null_mut();
        let mut loop_end_pos = pivot_pos;

        // SAFETY: this loop is equivalent to calling state.partition_one
        // exactly len times.
        loop {
            // Ideally the outer loop won't be unrolled, to save binary size,
            // but we do want the inner loop to be unrolled for small types, as
            // this gave significant performance boosts in benchmarks. Unrolling
            // through for _ in 0..UNROLL_LEN { .. } instead of manually improves
            // compile times but has a ~10-20% performance penalty on opt-level=s.
            if const { mem::size_of::<T>() <= 16 } {
                const UNROLL_LEN: usize = 4;
                let unroll_end = v_base.add(loop_end_pos.saturating_sub(UNROLL_LEN - 1));
                while state.scan < unroll_end {
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                }
            }

            let loop_end = v_base.add(loop_end_pos);
            while state.scan < loop_end {
                state.partition_one(is_less(&*state.scan, &*pivot));
            }

            if loop_end_pos == len {
                break;
            }

            // We avoid comparing pivot with itself, as this could create deadlocks for
            // certain comparison operators. We also store its location for later.
            pivot_in_scratch = state.partition_one(pivot_goes_left);

            loop_end_pos = len;
        }

        // `pivot` must be copied into its correct position again, because a
        // comparison operator might have modified it.
        if has_direct_interior_mutability::<T>() {
            ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
        }

        // SAFETY: partition_one being called exactly len times guarantees that scratch
        // is initialized with a permuted copy of `v`, and that num_left <= v.len().
        // Copying scratch[0..num_left] and scratch[num_left..v.len()] back is thus
        // sound, as the values in scratch will never be read again, meaning our copies
        // semantically act as moves, permuting `v`.

        // Copy all the elements < p directly from scratch to v.
        let v_base = v.as_mut_ptr();
        ptr::copy_nonoverlapping(scratch_base, v_base, state.num_left);

        // Copy the elements >= p in reverse order.
        for i in 0..len - state.num_left {
            ptr::copy_nonoverlapping(
                scratch_base.add(len - 1 - i),
                v_base.add(state.num_left + i),
                1,
            );
        }

        state.num_left
    }
}
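
// A safe-Rust sketch of the scratch layout described above, assuming `Clone`
// elements instead of raw moves: `<`-elements grow from the front of the
// scratch, `>=`-elements grow from the back (and so end up reversed), and the
// back half is read in reverse when copied out, which is what keeps the
// partition stable. `stable_partition_sketch` is a hypothetical helper.
#[cfg(test)]
fn stable_partition_sketch<T: Clone, F: FnMut(&T) -> bool>(v: &[T], mut is_lt: F) -> (Vec<T>, usize) {
    let len = v.len();
    let mut scratch: Vec<Option<T>> = vec![None; len];
    let (mut lo, mut hi) = (0, len);
    for x in v {
        // One store per element, to one of two cursors; branchless in the real code.
        if is_lt(x) {
            scratch[lo] = Some(x.clone());
            lo += 1;
        } else {
            hi -= 1;
            scratch[hi] = Some(x.clone());
        }
    }
    let mut out: Vec<T> = scratch[..lo].iter().map(|s| s.clone().unwrap()).collect();
    // The `>=` side was written back-to-front; reading it in reverse restores
    // the original relative order.
    out.extend(scratch[lo..].iter().rev().map(|s| s.clone().unwrap()));
    (out, lo)
}

#[test]
fn stable_partition_sketch_works() {
    let (out, num_lt) = stable_partition_sketch(&[5, 1, 4, 2, 3], |x| *x < 3);
    assert_eq!(num_lt, 2);
    assert_eq!(out, vec![1, 2, 5, 4, 3]); // both sides keep their input order
}
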
struct PartitionState<T> {
    // The start of the scratch auxiliary memory.
    scratch_base: *mut T,
    // The current element that is being looked at, scans left to right through slice.
    scan: *const T,
    // Counts the number of elements that went to the left side, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_left: usize,
    // Reverse scratch output pointer.
    scratch_rev: *mut T,
}

impl<T> PartitionState<T> {
    /// # Safety
    /// scan and scratch must point to valid disjoint buffers of length len. The
    /// scan buffer must be initialized.
    unsafe fn new(scan: *const T, scratch: *mut T, len: usize) -> Self {
        // SAFETY: See function safety comment.
        unsafe { Self { scratch_base: scratch, scan, num_left: 0, scratch_rev: scratch.add(len) } }
    }

    /// Depending on the value of `towards_left` this function will write a value
    /// to the growing left or right side of the scratch memory. This forms the
    /// branchless core of the partition.
    ///
    /// # Safety
    /// This function may be called at most `len` times. If it is called exactly
    /// `len` times the scratch buffer then contains a copy of each element from
    /// the scan buffer exactly once - a permutation, and num_left <= len.
    unsafe fn partition_one(&mut self, towards_left: bool) -> *mut T {
        // SAFETY: see individual comments.
        unsafe {
            // SAFETY: in-bounds because this function is called at most len times, and thus
            // right now is incremented at most len - 1 times. Similarly, num_left < len and
            // num_right < len, where num_right == i - num_left at the start of the ith
            // iteration (zero-indexed).
            self.scratch_rev = self.scratch_rev.sub(1);

            // SAFETY: now we have scratch_rev == base + len - (i + 1). This means
            // scratch_rev + num_left == base + len - 1 - num_right < base + len.
            let dst_base = if towards_left { self.scratch_base } else { self.scratch_rev };
            let dst = dst_base.add(self.num_left);
            ptr::copy_nonoverlapping(self.scan, dst, 1);

            self.num_left += towards_left as usize;
            self.scan = self.scan.add(1);
            dst
        }
    }
}

#[const_trait]
trait IsFreeze {
    fn is_freeze() -> bool;
}

impl<T> const IsFreeze for T {
    default fn is_freeze() -> bool {
        false
    }
}
impl<T: FreezeMarker> const IsFreeze for T {
    fn is_freeze() -> bool {
        true
    }
}

#[must_use]
const fn has_direct_interior_mutability<T>() -> bool {
    // If a type has interior mutability it may alter itself during comparison
    // in a way that must be preserved after the sort operation concludes.
    // Otherwise a type like Mutex<Option<Box<str>>> could lead to double free.
    !T::is_freeze()
}

#[test]
fn freeze_check() {
    assert!(!has_direct_interior_mutability::<u32>());
    assert!(!has_direct_interior_mutability::<[u128; 2]>());

    assert!(has_direct_interior_mutability::<crate::cell::Cell<u32>>());
    assert!(has_direct_interior_mutability::<crate::sync::Mutex<u32>>());
}
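
// A sketch of the hazard `has_direct_interior_mutability` guards against,
// assuming a `Cell`-based element type: a comparison function may legally
// mutate elements through `&T`, so any temporary copy the sort holds (like
// `pivot_copy` above) could go stale and must not be written back blindly.
// Illustrative test, not part of the implementation.
#[test]
fn interior_mutability_hazard_sketch() {
    use crate::cell::Cell;

    let v = [Cell::new(2), Cell::new(1)];
    let is_less = |a: &Cell<i32>, b: &Cell<i32>| {
        // A pathological but legal comparator that writes through `&T`.
        a.set(a.get() + 10);
        a.get() < b.get()
    };

    // After a single comparison the element has changed; a stale copy taken
    // before the call would no longer reflect the slice's true contents.
    let _ = is_less(&v[0], &v[1]);
    assert_eq!(v[0].get(), 12);
}
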
@@ -0,0 +1,80 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.

use crate::intrinsics;
use crate::ptr;

/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, it sits in the main hot-loop in `recurse` and is meant as an unlikely
/// algorithmic fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(v.len() >= 2);

        // Build the heap in linear time.
        for i in (0..v.len() / 2).rev() {
            sift_down(v, i, is_less);
        }

        // Pop maximal elements from the heap.
        for i in (1..v.len()).rev() {
            v.swap(0, i);
            sift_down(&mut v[..i], 0, is_less);
        }
    }
}

// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(node < v.len());
    }

    let len = v.len();

    let v_base = v.as_mut_ptr();

    loop {
        // Children of `node`.
        let mut child = 2 * node + 1;
        if child >= len {
            break;
        }

        // SAFETY: The invariants and checks guarantee that both node and child are in-bounds.
        unsafe {
            // Choose the greater child.
            if child + 1 < len {
                // We need a branch to be sure not to out-of-bounds index,
                // but it's highly predictable. The comparison, however,
                // is better done branchless, especially for primitives.
                child += is_less(&*v_base.add(child), &*v_base.add(child + 1)) as usize;
            }

            // Stop if the invariant holds at `node`.
            if !is_less(&*v_base.add(node), &*v_base.add(child)) {
                break;
            }

            // Swap `node` with the greater child, move one step down, and continue sifting. This
            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
            ptr::swap(v_base.add(node), v_base.add(child));
        }

        node = child;
    }
}
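
// A safe sketch of the same heap layout, assuming `Ord` elements instead of an
// `is_less` closure: node `i` has children `2*i + 1` and `2*i + 2`, the heap is
// built bottom-up from the last parent, and the maximum is repeatedly swapped
// to the shrinking tail. `heapsort_sketch` is a hypothetical helper.
#[cfg(test)]
fn heapsort_sketch<T: Ord>(v: &mut [T]) {
    fn sift<T: Ord>(v: &mut [T], mut node: usize) {
        loop {
            let mut child = 2 * node + 1;
            if child >= v.len() {
                break;
            }
            // Pick the greater child; the bump mirrors the branchless `+=` above.
            if child + 1 < v.len() && v[child] < v[child + 1] {
                child += 1;
            }
            if v[node] >= v[child] {
                break;
            }
            v.swap(node, child);
            node = child;
        }
    }

    for i in (0..v.len() / 2).rev() {
        sift(v, i);
    }
    for i in (1..v.len()).rev() {
        v.swap(0, i);
        sift(&mut v[..i], 0);
    }
}

#[test]
fn heapsort_sketch_works() {
    let mut v = [5, 1, 4, 2, 3];
    heapsort_sketch(&mut v);
    assert_eq!(v, [1, 2, 3, 4, 5]);
}
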
@@ -0,0 +1,76 @@
//! This module contains the entry points for `slice::sort_unstable`.

use crate::intrinsics;
use crate::mem::SizedTypeProperties;

use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
pub(crate) mod quicksort;

/// Unstable sort called ipnsort by Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }

    ipnsort(v, is_less);
}

/// See [`sort`].
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();
    let (run_len, was_reversed) = find_existing_run(v, is_less);

    // SAFETY: find_existing_run promises to return a valid run_len.
    unsafe { intrinsics::assume(run_len <= len) };

    if run_len == len {
        if was_reversed {
            v.reverse();
        }

        // It would be possible to do in-place merging here for a long existing streak. But that
        // makes the implementation a lot bigger; users can use `slice::sort` for that use-case.
        return;
    }

    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (len | 1).ilog2();
    crate::slice::sort::unstable::quicksort::quicksort(v, None, limit, is_less);
}
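
// A worked example of the recursion limit above, assuming these input lengths:
// `len | 1` only forces the argument to be non-zero so `ilog2` cannot panic;
// for any `len >= 2` it never changes the result, because an even `len` plus
// one is odd and therefore never a power of two.
#[test]
fn recursion_limit_sketch() {
    assert_eq!(2 * (24u32 | 1).ilog2(), 8); // floor(log2(25)) == floor(log2(24)) == 4
    assert_eq!(2 * (1024u32 | 1).ilog2(), 20); // floor(log2(1025)) == floor(log2(1024)) == 10
}
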
@@ -0,0 +1,347 @@
//! This module contains an unstable quicksort and two partition implementations.

use crate::intrinsics;
use crate::mem::{self, ManuallyDrop};
use crate::ptr;

use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;

/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `ancestor_pivot`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
pub(crate) fn quicksort<'a, T, F>(
    mut v: &'a mut [T],
    mut ancestor_pivot: Option<&'a T>,
    mut limit: u32,
    is_less: &mut F,
) where
    F: FnMut(&T, &T) -> bool,
{
    loop {
        if v.len() <= T::small_sort_threshold() {
            T::small_sort(v, is_less);
            return;
        }

        // If too many bad pivot choices were made, simply fall back to heapsort in order to
        // guarantee `O(N x log(N))` worst-case.
        if limit == 0 {
            // SAFETY: We assume the `small_sort` threshold is at least 1.
            unsafe {
                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
            }
            return;
        }

        limit -= 1;

        // Choose a pivot and try guessing whether the slice is already sorted.
        let pivot_pos = choose_pivot(v, is_less);

        // If the chosen pivot is equal to the predecessor, then it's the smallest element in the
        // slice. Partition the slice into elements equal to and elements greater than the pivot.
        // This case is usually hit when the slice contains many duplicate elements.
        if let Some(p) = ancestor_pivot {
            // SAFETY: We assume choose_pivot yields an in-bounds position.
            if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
                let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));

                // Continue sorting elements greater than the pivot. We know that `num_lt` contains
                // the pivot. So we can continue after `num_lt`.
                v = &mut v[(num_lt + 1)..];
                ancestor_pivot = None;
                continue;
            }
        }

        // Partition the slice.
        let num_lt = partition(v, pivot_pos, is_less);
        // SAFETY: partition ensures that `num_lt` will be in-bounds.
        unsafe { intrinsics::assume(num_lt < v.len()) };

        // Split the slice into `left`, `pivot`, and `right`.
        let (left, right) = v.split_at_mut(num_lt);
        let (pivot, right) = right.split_at_mut(1);
        let pivot = &pivot[0];

        // Recurse into the left side. We have a fixed recursion limit, testing shows no real
        // benefit for recursing into the shorter side.
        quicksort(left, ancestor_pivot, limit, is_less);

        // Continue with the right side.
        v = right;
        ancestor_pivot = Some(pivot);
    }
}
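
// A small sketch of the flipped predicate used for the equal-elements
// partition above, assuming a strict weak ordering `is_less`: the closure
// `|a, b| !is_less(b, a)` asks "a <= b", so partitioning with it gathers
// everything equivalent to the pivot (including the pivot itself) on the left.
// Illustrative test, not part of the implementation.
#[test]
fn flipped_predicate_sketch() {
    let mut is_less = |a: &i32, b: &i32| a < b;
    let mut le = |a: &i32, b: &i32| !is_less(b, a);

    assert!(le(&3, &3)); // equivalent elements compare true, unlike with `is_less`
    assert!(le(&3, &4));
    assert!(!le(&4, &3));
}
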
/// Takes the input slice `v` and re-arranges elements such that when the call returns normally
/// all elements that compare true for `is_less(elem, pivot)` where `pivot == v[pivot_pos]` are
/// on the left side of `v` followed by the other elements, notionally considered greater or
/// equal to `pivot`.
///
/// Returns the number of elements that are compared true for `is_less(elem, pivot)`.
///
/// If `is_less` does not implement a total order the resulting order and return value are
/// unspecified. All original elements will remain in `v` and any possible modifications via
/// interior mutability will be observable. Same is true if `is_less` panics.
pub(crate) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Allows for panic-free code-gen by proving this property to the compiler.
    if len == 0 {
        return 0;
    }

    // Allows for panic-free code-gen by proving this property to the compiler.
    if pivot >= len {
        intrinsics::abort();
    }

    // Place the pivot at the beginning of slice.
    v.swap(0, pivot);
    let (pivot, v_without_pivot) = v.split_at_mut(1);

    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
    // into a stack value, but this creates issues for types with interior mutability mandating
    // a drop guard.
    let pivot = &mut pivot[0];

    // This construct is used to limit the LLVM IR generated, which saves large amounts of
    // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
    let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

    // Place the pivot between the two partitions.
    v.swap(0, num_lt);

    num_lt
}

const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut F) -> usize {
    const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
    if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
        // Specialize for types that are relatively cheap to copy, where branchless optimizations
        // have large leverage e.g. `u64` and `String`.
        partition_lomuto_branchless_cyclic::<T, F>
    } else {
        partition_hoare_branchy_cyclic::<T, F>
    }
}
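
// A sketch of the instantiation trick above, assuming a standalone helper: the
// branch is decided inside a `const` block, so for each concrete `T` only the
// selected strategy is referenced. In the real `inst_partition` this means the
// non-chosen partition function is never monomorphized at all.
// `dispatch_sketch` is a hypothetical helper.
#[cfg(test)]
fn dispatch_sketch<T>() -> &'static str {
    const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
    // Evaluated at compile time for each instantiation of `T`.
    if const { mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE } {
        "lomuto_branchless_cyclic"
    } else {
        "hoare_branchy_cyclic"
    }
}

#[test]
fn dispatch_sketch_works() {
    assert_eq!(dispatch_sketch::<u64>(), "lomuto_branchless_cyclic");
    assert_eq!(dispatch_sketch::<[u8; 128]>(), "hoare_branchy_cyclic");
}
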
/// See [`partition`].
fn partition_hoare_branchy_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    if len == 0 {
        return 0;
    }

    // Optimized for large types that are expensive to move. Not optimized for integers. Optimized
    // for small code-gen, assuming that is_less is an expensive operation that generates
    // substantial amounts of code or a call. And that copying elements will likely be a call to
    // memcpy. Using two `ptr::copy_nonoverlapping` calls has the chance to be faster than
    // `ptr::swap_nonoverlapping` because `memcpy` can use wide SIMD based on runtime feature
    // detection. Benchmarks support this analysis.

    let mut gap_opt: Option<GapGuard<T>> = None;

    // SAFETY: The left-to-right scanning loop performs a bounds check, where we know that `left >=
    // v_base && left < right && right <= v_base.add(len)`. The right-to-left scanning loop performs
    // a bounds check ensuring that `right` is in-bounds. We checked that `len` is more than zero,
    // which means that unconditional `right = right.sub(1)` is safe to do. The exit check makes
    // sure that `left` and `right` never alias, making `ptr::copy_nonoverlapping` safe. The
    // drop-guard `gap` ensures that should `is_less` panic we always overwrite the duplicate in the
    // input. `gap.pos` stores the previous value of `right` and starts at `right` and so it too is
    // in-bounds. We never pass the saved `gap.value` to `is_less` while it is inside the `GapGuard`
    // thus any changes via interior mutability will be observed.
    unsafe {
        let v_base = v.as_mut_ptr();

        let mut left = v_base;
        let mut right = v_base.add(len);

        loop {
            // Find the first element greater than the pivot.
            while left < right && is_less(&*left, pivot) {
                left = left.add(1);
            }

            // Find the last element less than the pivot.
            loop {
                right = right.sub(1);
                if left >= right || is_less(&*right, pivot) {
                    break;
                }
            }

            if left >= right {
                break;
            }

            // Swap the found pair of out-of-order elements via cyclic permutation.
            let is_first_swap_pair = gap_opt.is_none();

            if is_first_swap_pair {
                gap_opt = Some(GapGuard { pos: right, value: ManuallyDrop::new(ptr::read(left)) });
            }

            let gap = gap_opt.as_mut().unwrap_unchecked();

            // Single place where we instantiate ptr::copy_nonoverlapping in the partition.
            if !is_first_swap_pair {
                ptr::copy_nonoverlapping(left, gap.pos, 1);
            }
            gap.pos = right;
            ptr::copy_nonoverlapping(right, left, 1);

            left = left.add(1);
        }

        left.sub_ptr(v_base)

        // `gap_opt` goes out of scope and overwrites the last wrong-side element on the right
        // side with the first wrong-side element of the left side, which was initially
        // overwritten by the first wrong-side element on the right side.
    }
}
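
// A safe, index-based sketch of the cyclic permutation above, assuming `Copy`
// elements and a precomputed list of out-of-order pairs: the first left
// element is lifted out once into a gap value, every later pair costs one move
// plus a gap relocation instead of a full three-move swap, and the gap is
// plugged at the end (the job of the `GapGuard` drop impl above).
// `cyclic_pairs_sketch` is a hypothetical helper.
#[cfg(test)]
fn cyclic_pairs_sketch(v: &mut [i32], pairs: &[(usize, usize)]) {
    let Some(&(first_left, _)) = pairs.first() else { return };
    // Lift the first wrong-side element out; its slot becomes the moving gap.
    let gap_value = v[first_left];
    let mut gap_pos = first_left;

    for (i, &(left, right)) in pairs.iter().enumerate() {
        if i > 0 {
            v[gap_pos] = v[left];
        }
        gap_pos = right;
        v[left] = v[right];
    }

    // Plug the final gap with the lifted-out value.
    v[gap_pos] = gap_value;
}

#[test]
fn cyclic_pairs_sketch_works() {
    // Partitioning [9, 8, 1, 2] around pivot 5 finds the pairs (0, 3) and (1, 2).
    let mut v = [9, 8, 1, 2];
    cyclic_pairs_sketch(&mut v, &[(0, 3), (1, 2)]);
    assert_eq!(v, [2, 1, 9, 8]); // all < 5 on the left, all >= 5 on the right
}
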
struct PartitionState<T> {
    // The current element that is being looked at, scans left to right through slice.
    right: *mut T,
    // Counts the number of elements that compared less-than, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_lt: usize,
    // Gap guard that tracks the temporary duplicate in the input.
    gap: GapGuardRaw<T>,
}

fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    // Novel partition implementation by Lukas Bergdoll and Orson Peters. Branchless Lomuto
    // partition paired with a cyclic permutation.
    // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md

    let len = v.len();
    let v_base = v.as_mut_ptr();

    if len == 0 {
        return 0;
    }

    // SAFETY: We checked that `len` is more than zero, which means that reading `v_base` is safe to
    // do. From there we have a bounded loop where `v_base.add(i)` is guaranteed in-bounds. `v` and
    // `pivot` can't alias because of type system rules. The drop-guard `gap` ensures that should
    // `is_less` panic we always overwrite the duplicate in the input. `gap.pos` stores the previous
    // value of `right` and starts at `v_base` and so it too is in-bounds. Given `UNROLL_LEN == 2`
    // after the main loop we either have A) the last element in `v` that has not yet been processed
    // because `len % 2 != 0`, or B) all elements have been processed except the gap value that was
    // saved at the beginning with `ptr::read(v_base)`. In the case A) the loop will iterate twice,
    // first performing loop_body to take care of the last element that didn't fit into the unroll.
    // After that the behavior is the same as for B) where we use the saved value as `right` to
    // overwrite the duplicate. If this very last call to `is_less` panics the saved value will be
    // copied back including all possible changes via interior mutability. If `is_less` does not
    // panic and the code continues we overwrite the duplicate and do `right = right.add(1)`, this
    // is safe to do with `&mut *gap.value` because `T` is the same as `[T; 1]` and generating a
    // pointer one past the allocation is safe.
    unsafe {
        let mut loop_body = |state: &mut PartitionState<T>| {
            let right_is_lt = is_less(&*state.right, pivot);
            let left = v_base.add(state.num_lt);

            ptr::copy(left, state.gap.pos, 1);
            ptr::copy_nonoverlapping(state.right, left, 1);

            state.gap.pos = state.right;
            state.num_lt += right_is_lt as usize;

            state.right = state.right.add(1);
        };

        // Ideally we could just use GapGuard in PartitionState, but the reference that is
        // materialized with `&mut state` when calling `loop_body` would create a mutable reference
        // to the parent struct that contains the gap value, invalidating the reference pointer
        // created from a reference to the gap value in the cleanup loop. This is only an issue
        // under Stacked Borrows, Tree Borrows accepts the intuitive code using GapGuard as valid.
        let mut gap_value = ManuallyDrop::new(ptr::read(v_base));

        let mut state = PartitionState {
            num_lt: 0,
            right: v_base.add(1),

            gap: GapGuardRaw { pos: v_base, value: &mut *gap_value },
        };

        // Manual unrolling that works well on x86, Arm and with opt-level=s without murdering
        // compile-times. Leaving this to the compiler yields ok to bad results.
        let unroll_len = const { if mem::size_of::<T>() <= 16 { 2 } else { 1 } };

        let unroll_end = v_base.add(len - (unroll_len - 1));
        while state.right < unroll_end {
            if unroll_len == 2 {
                loop_body(&mut state);
                loop_body(&mut state);
            } else {
                loop_body(&mut state);
            }
        }

        // Instantiate `loop_body` once for both the unroll cleanup and the cyclic permutation
        // cleanup. Optimizes binary-size and compile-time.
        let end = v_base.add(len);
        loop {
            let is_done = state.right == end;
            state.right = if is_done { state.gap.value } else { state.right };

            loop_body(&mut state);

            if is_done {
                mem::forget(state.gap);
                break;
            }
        }

        state.num_lt
    }
}
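
// A safe sketch of the branchless Lomuto scheme above, assuming a plain swap
// instead of the cyclic permutation: every element is moved unconditionally
// and `num_lt` advances by the comparison result cast to an integer, so the
// loop contains no data-dependent branch. `lomuto_branchless_sketch` is a
// hypothetical helper.
#[cfg(test)]
fn lomuto_branchless_sketch<T, F: FnMut(&T) -> bool>(v: &mut [T], mut is_lt: F) -> usize {
    let mut num_lt = 0;
    for i in 0..v.len() {
        let lt = is_lt(&v[i]);
        // Unconditional move; a no-op when `num_lt == i`.
        v.swap(num_lt, i);
        num_lt += lt as usize;
    }
    num_lt
}

#[test]
fn lomuto_branchless_sketch_works() {
    let mut v = [5, 1, 4, 2, 3];
    let num_lt = lomuto_branchless_sketch(&mut v, |x| *x < 3);
    assert_eq!(num_lt, 2);
    assert!(v[..num_lt].iter().all(|x| *x < 3));
    assert!(v[num_lt..].iter().all(|x| *x >= 3));
}
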
struct GapGuard<T> {
    pos: *mut T,
    value: ManuallyDrop<T>,
}

impl<T> Drop for GapGuard<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(&*self.value, self.pos, 1);
        }
    }
}

/// Ideally this wouldn't be needed and we could just use the regular GapGuard.
/// See comment in [`partition_lomuto_branchless_cyclic`].
struct GapGuardRaw<T> {
    pos: *mut T,
    value: *mut T,
}

impl<T> Drop for GapGuardRaw<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.value, self.pos, 1);
        }
    }
}
@@ -49,7 +49,6 @@
#![feature(is_sorted)]
#![feature(layout_for_ptr)]
#![feature(pattern)]
#![feature(sort_internals)]
#![feature(slice_take)]
#![feature(slice_from_ptr_range)]
#![feature(slice_split_once)]

@@ -1803,9 +1803,8 @@ fn brute_force_rotate_test_1() {
#[test]
#[cfg(not(target_arch = "wasm32"))]
fn sort_unstable() {
    use core::cmp::Ordering::{Equal, Greater, Less};
    use core::slice::heapsort;
    use rand::{seq::SliceRandom, Rng};
    // use core::cmp::Ordering::{Equal, Greater, Less};
    use rand::Rng;

    // Miri is too slow (but still need to `chain` to make the types match)
    let lens = if cfg!(miri) { (2..20).chain(0..0) } else { (2..25).chain(500..510) };
@@ -1839,31 +1838,10 @@ fn sort_unstable() {
                tmp.copy_from_slice(v);
                tmp.sort_unstable_by(|a, b| b.cmp(a));
                assert!(tmp.windows(2).all(|w| w[0] >= w[1]));

                // Test heapsort using `<` operator.
                tmp.copy_from_slice(v);
                heapsort(tmp, |a, b| a < b);
                assert!(tmp.windows(2).all(|w| w[0] <= w[1]));

                // Test heapsort using `>` operator.
                tmp.copy_from_slice(v);
                heapsort(tmp, |a, b| a > b);
                assert!(tmp.windows(2).all(|w| w[0] >= w[1]));
            }
        }
    }

    // Sort using a completely random comparison function.
    // This will reorder the elements *somehow*, but won't panic.
    for i in 0..v.len() {
        v[i] = i as i32;
    }
    v.sort_unstable_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
    v.sort_unstable();
    for i in 0..v.len() {
        assert_eq!(v[i], i as i32);
    }

    // Should not panic.
    [0i32; 0].sort_unstable();
    [(); 10].sort_unstable();