Merge pull request #4962 from elichai/elichai/_mm512_permutexvar_epi64

Implement `_mm512_permutexvar_epi64` shim
This commit is contained in:
Ralf Jung
2026-04-24 15:47:58 +00:00
committed by GitHub
3 changed files with 44 additions and 7 deletions
+2 -2
View File
@@ -104,8 +104,8 @@ fn emulate_x86_avx512_intrinsic(
pmaddbw(this, left, right, dest)?;
}
// Used to implement the _mm512_permutexvar_epi32 function.
"permvar.si.512" => {
// Used to implement the _mm512_permutexvar_epi32/_mm512_permutexvar_epi64 functions.
"permvar.si.512" | "permvar.di.512" => {
let [left, right] =
this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
+18 -5
View File
@@ -1056,12 +1056,22 @@ fn pmaddbw<'tcx>(
interp_ok(())
}
/// Shuffle 32-bit integers in `values` across lanes using the corresponding
/// index in `indices`, and store the results in dst.
/// Shuffle elements in `values` across lanes using the corresponding index in
/// `indices`, and store the results in `dest`.
///
/// This helper is shared by both the 32-bit-lane and 64-bit-lane AVX
/// permute-by-index intrinsics. The element type is taken from `values` and
/// `dest`, while the index lanes are interpreted at their full width (`i32` or
/// `i64`, depending on the intrinsic).
///
/// For a vector with `N` lanes, only the low `log2(N)` bits of each index are
/// used. Equivalently, lane `i` of the result is copied from
/// `values[indices[i] & (N - 1)]`.
///
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64>
fn permute<'tcx>(
ecx: &mut crate::MiriInterpCx<'tcx>,
values: &OpTy<'tcx>,
@@ -1075,18 +1085,21 @@ fn permute<'tcx>(
// fn permd(a: u32x8, b: u32x8) -> u32x8;
// fn permps(a: __m256, b: i32x8) -> __m256;
// fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
// fn vpermq(a: i64x8, b: i64x8) -> i64x8;
assert_eq!(dest_len, values_len);
assert_eq!(dest_len, indices_len);
// Only use the lower 3 bits to index into a vector with 8 lanes,
// or the lower 4 bits when indexing into a 16-lane vector.
assert!(dest_len.is_power_of_two());
let mask = u32::try_from(dest_len).unwrap().strict_sub(1);
let mask = u128::from(dest_len).strict_sub(1);
for i in 0..dest_len {
let dest = ecx.project_index(&dest, i)?;
let index = ecx.read_scalar(&ecx.project_index(&indices, i)?)?.to_u32()?;
let element = ecx.project_index(&values, (index & mask).into())?;
let index_place = ecx.project_index(&indices, i)?;
let index = ecx.read_scalar(&index_place)?.to_uint(index_place.layout.size)?;
// `mask` is at most `dest_len - 1` which fits in a `u64`, so this cannot fail.
let element = ecx.project_index(&values, u64::try_from(index & mask).unwrap())?;
ecx.copy_op(&element, &dest)?;
}
@@ -219,6 +219,30 @@ unsafe fn test_mm512_permutexvar_epi32() {
}
test_mm512_permutexvar_epi32();
// Regression test for the `_mm512_permutexvar_epi64` shim: lane `i` of the
// result must equal `a[idx[i] & 7]` for an 8-lane (512-bit) i64 vector.
#[target_feature(enable = "avx512f")]
unsafe fn test_mm512_permutexvar_epi64() {
// Lane `i` holds `(i + 1) * 100`, so each result value identifies its source lane.
let a = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800);
// Mirrors stdarch's basic sanity check.
let idx = _mm512_set1_epi64(1);
let r = _mm512_permutexvar_epi64(idx, a);
// Every output lane selects source lane 1 (value 200).
let e = _mm512_set1_epi64(200);
assert_eq_m512i(r, e);
// This must permute across the full 512-bit register, not within 128-bit lanes.
let idx = _mm512_setr_epi64(7, 0, 5, 2, 6, 1, 4, 3);
let r = _mm512_permutexvar_epi64(idx, a);
let e = _mm512_setr_epi64(800, 100, 600, 300, 700, 200, 500, 400);
assert_eq_m512i(r, e);
// Only the low 3 bits of each 64-bit index are used.
// 8 & 7 == 0, 15 & 7 == 7, (-1) & 7 == 7, i64::MIN & 7 == 0, so the expected
// lanes are 0, 7, 7, 0 followed by the identity mapping 0..=3.
let idx = _mm512_setr_epi64(8, 15, -1, i64::MIN, 0, 1, 2, 3);
let r = _mm512_permutexvar_epi64(idx, a);
let e = _mm512_setr_epi64(100, 800, 800, 100, 100, 200, 300, 400);
assert_eq_m512i(r, e);
}
test_mm512_permutexvar_epi64();
#[target_feature(enable = "avx512bw")]
unsafe fn test_mm512_shuffle_epi8() {
#[rustfmt::skip]