mirror of
https://github.com/rust-lang/rust.git
synced 2026-05-23 02:27:39 +03:00
Auto merge of #153640 - Zalathar:subchunk, r=wesleywiser
Make bitset `would_modify_words` more vectorizer-friendly

Currently this function compares a single pair of `u64` at a time, which is potentially slower than comparing multiple words before each early-exit check, especially for the large chunks used by ChunkedBitSet.

Perf shows a notable improvement in `cranelift-codegen`, which is the one benchmark that is known to stress these code paths.

- Incorporates https://github.com/rust-lang/rust/pull/153759.
This commit is contained in:
@@ -319,7 +319,7 @@ pub fn union_not(&mut self, other: &DenseBitSet<T>) {
|
||||
// quickly and accurately detect whether the update changed anything.
|
||||
// But that's only worth doing if there's an actual use-case.
|
||||
|
||||
bitwise(&mut self.words, &other.words, |a, b| a | !b);
|
||||
update_words(&mut self.words, &other.words, |a, b| a | !b);
|
||||
// The bitwise update `a | !b` can result in the last word containing
|
||||
// out-of-domain bits, so we need to clear them.
|
||||
self.clear_excess_bits();
|
||||
@@ -330,17 +330,17 @@ pub fn union_not(&mut self, other: &DenseBitSet<T>) {
|
||||
impl<T: Idx> BitRelations<DenseBitSet<T>> for DenseBitSet<T> {
|
||||
fn union(&mut self, other: &DenseBitSet<T>) -> bool {
|
||||
assert_eq!(self.domain_size, other.domain_size);
|
||||
bitwise(&mut self.words, &other.words, |a, b| a | b)
|
||||
update_words(&mut self.words, &other.words, |a, b| a | b)
|
||||
}
|
||||
|
||||
fn subtract(&mut self, other: &DenseBitSet<T>) -> bool {
|
||||
assert_eq!(self.domain_size, other.domain_size);
|
||||
bitwise(&mut self.words, &other.words, |a, b| a & !b)
|
||||
update_words(&mut self.words, &other.words, |a, b| a & !b)
|
||||
}
|
||||
|
||||
fn intersect(&mut self, other: &DenseBitSet<T>) -> bool {
|
||||
assert_eq!(self.domain_size, other.domain_size);
|
||||
bitwise(&mut self.words, &other.words, |a, b| a & b)
|
||||
update_words(&mut self.words, &other.words, |a, b| a & b)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -787,7 +787,7 @@ fn union(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
// Do a more precise "will anything change?" test. Also a
|
||||
// performance win.
|
||||
let op = |a, b| a | b;
|
||||
if !bitwise_changes(
|
||||
if !would_modify_words(
|
||||
&self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -797,7 +797,7 @@ fn union(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
|
||||
// If we reach here, `self_chunk_words` is definitely changing.
|
||||
let self_chunk_words = Rc::make_mut(self_chunk_words);
|
||||
let has_changed = bitwise(
|
||||
let has_changed = update_words(
|
||||
&mut self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -865,7 +865,7 @@ fn subtract(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
// See `ChunkedBitSet::union` for details on what is happening here.
|
||||
let num_words = num_words(*chunk_domain_size as usize);
|
||||
let op = |a: Word, b: Word| a & !b;
|
||||
if !bitwise_changes(
|
||||
if !would_modify_words(
|
||||
&self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -874,7 +874,7 @@ fn subtract(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
}
|
||||
|
||||
let self_chunk_words = Rc::make_mut(self_chunk_words);
|
||||
let has_changed = bitwise(
|
||||
let has_changed = update_words(
|
||||
&mut self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -914,7 +914,7 @@ fn intersect(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
// See `ChunkedBitSet::union` for details on what is happening here.
|
||||
let num_words = num_words(*chunk_domain_size as usize);
|
||||
let op = |a, b| a & b;
|
||||
if !bitwise_changes(
|
||||
if !would_modify_words(
|
||||
&self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -923,7 +923,7 @@ fn intersect(&mut self, other: &ChunkedBitSet<T>) -> bool {
|
||||
}
|
||||
|
||||
let self_chunk_words = Rc::make_mut(self_chunk_words);
|
||||
let has_changed = bitwise(
|
||||
let has_changed = update_words(
|
||||
&mut self_chunk_words[0..num_words],
|
||||
&other_chunk_words[0..num_words],
|
||||
op,
|
||||
@@ -1052,10 +1052,10 @@ fn fmt(&self, w: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets `out_vec[i] = op(out_vec[i], in_vec[i])` for each index `i` in both
|
||||
/// Sets `lhs[i] = op(lhs[i], rhs[i])` for each index `i` in both
|
||||
/// slices. The slices must have the same length.
|
||||
///
|
||||
/// Returns true if at least one bit in `out_vec` was changed.
|
||||
/// Returns true if at least one bit in `lhs` was changed.
|
||||
///
|
||||
/// ## Warning
|
||||
/// Some bitwise operations (e.g. union-not, xor) can set output bits that were
|
||||
@@ -1065,16 +1065,16 @@ fn fmt(&self, w: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
/// "changed" return value unreliable, because the change might have only
|
||||
/// affected excess bits.
|
||||
#[inline]
|
||||
fn bitwise<Op>(out_vec: &mut [Word], in_vec: &[Word], op: Op) -> bool
|
||||
fn update_words<Op>(lhs: &mut [Word], rhs: &[Word], op: Op) -> bool
|
||||
where
|
||||
Op: Fn(Word, Word) -> Word,
|
||||
{
|
||||
assert_eq!(out_vec.len(), in_vec.len());
|
||||
assert_eq!(lhs.len(), rhs.len());
|
||||
let mut changed = 0;
|
||||
for (out_elem, in_elem) in iter::zip(out_vec, in_vec) {
|
||||
let old_val = *out_elem;
|
||||
let new_val = op(old_val, *in_elem);
|
||||
*out_elem = new_val;
|
||||
for (lhs_slot, &rhs_val) in iter::zip(lhs, rhs) {
|
||||
let old_val = *lhs_slot;
|
||||
let new_val = op(old_val, rhs_val);
|
||||
*lhs_slot = new_val;
|
||||
// This is essentially equivalent to a != with changed being a bool, but
|
||||
// in practice this code gets auto-vectorized by the compiler for most
|
||||
// operators. Using != here causes us to generate quite poor code as the
|
||||
@@ -1084,21 +1084,40 @@ fn bitwise<Op>(out_vec: &mut [Word], in_vec: &[Word], op: Op) -> bool
|
||||
changed != 0
|
||||
}
|
||||
|
||||
/// Does this bitwise operation change `out_vec`?
|
||||
/// Returns true if a call to [`update_words`] would modify `lhs`, i.e.
|
||||
/// `lhs[i] != op(lhs[i], rhs[i])` for some `i`.
|
||||
#[inline]
|
||||
fn bitwise_changes<Op>(out_vec: &[Word], in_vec: &[Word], op: Op) -> bool
|
||||
fn would_modify_words<Op>(lhs: &[Word], rhs: &[Word], op: Op) -> bool
|
||||
where
|
||||
Op: Fn(Word, Word) -> Word,
|
||||
{
|
||||
assert_eq!(out_vec.len(), in_vec.len());
|
||||
for (out_elem, in_elem) in iter::zip(out_vec, in_vec) {
|
||||
let old_val = *out_elem;
|
||||
let new_val = op(old_val, *in_elem);
|
||||
if old_val != new_val {
|
||||
assert_eq!(lhs.len(), rhs.len());
|
||||
|
||||
// To make codegen more vectorizer-friendly, we traverse each slice in larger
|
||||
// "subchunks", and only consider an early return at subchunk boundaries.
|
||||
// These subchunks are smaller than full `ChunkedBitSet` chunks, so that
|
||||
// we still have some chance of stopping early.
|
||||
const SUBCHUNK_LEN: usize = 64 / size_of::<Word>();
|
||||
let (lhs_chunks, lhs_tail) = lhs.as_chunks::<SUBCHUNK_LEN>();
|
||||
let (rhs_chunks, rhs_tail) = rhs.as_chunks::<SUBCHUNK_LEN>();
|
||||
|
||||
let would_modify_subchunk = |lhs_chunk: &[Word], rhs_chunk: &[Word]| {
|
||||
let mut changed = 0;
|
||||
for (&old_val, &rhs_val) in iter::zip(lhs_chunk, rhs_chunk) {
|
||||
let new_val = op(old_val, rhs_val);
|
||||
// Set `changed` to a non-zero value if any bits changed.
|
||||
// This gives better SIMD codegen than using an actual boolean.
|
||||
changed |= old_val ^ new_val;
|
||||
}
|
||||
changed != 0
|
||||
};
|
||||
|
||||
for (lhs_chunk, rhs_chunk) in iter::zip(lhs_chunks, rhs_chunks) {
|
||||
if would_modify_subchunk(lhs_chunk, rhs_chunk) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
would_modify_subchunk(lhs_tail, rhs_tail)
|
||||
}
|
||||
|
||||
/// A bitset with a mixed representation, using `DenseBitSet` for small and
|
||||
@@ -1499,7 +1518,7 @@ pub fn union_row_with(&mut self, with: &DenseBitSet<C>, write: R) -> bool {
|
||||
assert!(write.index() < self.num_rows);
|
||||
assert_eq!(with.domain_size(), self.num_columns);
|
||||
let (write_start, write_end) = self.range(write);
|
||||
bitwise(&mut self.words[write_start..write_end], &with.words, |a, b| a | b)
|
||||
update_words(&mut self.words[write_start..write_end], &with.words, |a, b| a | b)
|
||||
}
|
||||
|
||||
/// Sets every cell in `row` to true.
|
||||
|
||||
Reference in New Issue
Block a user