Merge pull request #4962 from elichai/elichai/_mm512_permutexvar_epi64

Implement `_mm512_permutexvar_epi64` shim
This commit is contained in:
Ralf Jung
2026-04-24 15:47:58 +00:00
committed by GitHub
3 changed files with 44 additions and 7 deletions
+2 -2
View File
@@ -104,8 +104,8 @@ fn emulate_x86_avx512_intrinsic(
pmaddbw(this, left, right, dest)?;
}
// Used to implement the _mm512_permutexvar_epi32 function.
"permvar.si.512" => {
// Used to implement the _mm512_permutexvar_epi32/_mm512_permutexvar_epi64 functions.
"permvar.si.512" | "permvar.di.512" => {
let [left, right] =
this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
+18 -5
View File
@@ -1056,12 +1056,22 @@ fn pmaddbw<'tcx>(
interp_ok(())
}
/// Shuffle 32-bit integers in `values` across lanes using the corresponding
/// index in `indices`, and store the results in dst.
/// Shuffle elements in `values` across lanes using the corresponding index in
/// `indices`, and store the results in `dest`.
///
/// This helper is shared by both the 32-bit-lane and 64-bit-lane AVX
/// permute-by-index intrinsics. The element type is taken from `values` and
/// `dest`, while the index lanes are interpreted at their full width (`i32` or
/// `i64`, depending on the intrinsic).
///
/// For a vector with `N` lanes, only the low `log2(N)` bits of each index are
/// used. Equivalently, lane `i` of the result is copied from
/// `values[indices[i] & (N - 1)]`.
///
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32>
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64>
fn permute<'tcx>(
ecx: &mut crate::MiriInterpCx<'tcx>,
values: &OpTy<'tcx>,
@@ -1075,18 +1085,21 @@ fn permute<'tcx>(
// fn permd(a: u32x8, b: u32x8) -> u32x8;
// fn permps(a: __m256, b: i32x8) -> __m256;
// fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
// fn vpermq(a: i64x8, b: i64x8) -> i64x8;
assert_eq!(dest_len, values_len);
assert_eq!(dest_len, indices_len);
// Only use the lower 3 bits to index into a vector with 8 lanes,
// or the lower 4 bits when indexing into a 16-lane vector.
assert!(dest_len.is_power_of_two());
let mask = u32::try_from(dest_len).unwrap().strict_sub(1);
let mask = u128::from(dest_len).strict_sub(1);
for i in 0..dest_len {
let dest = ecx.project_index(&dest, i)?;
let index = ecx.read_scalar(&ecx.project_index(&indices, i)?)?.to_u32()?;
let element = ecx.project_index(&values, (index & mask).into())?;
let index_place = ecx.project_index(&indices, i)?;
let index = ecx.read_scalar(&index_place)?.to_uint(index_place.layout.size)?;
// `mask` is at most `dest_len - 1` which fits in a `u64`, so this cannot fail.
let element = ecx.project_index(&values, u64::try_from(index & mask).unwrap())?;
ecx.copy_op(&element, &dest)?;
}
@@ -219,6 +219,30 @@ unsafe fn test_mm512_permutexvar_epi32() {
}
test_mm512_permutexvar_epi32();
// Regression test for the `_mm512_permutexvar_epi64` shim: lane `i` of the
// result must equal `a[idx[i] & 7]` for an 8-lane (512-bit) i64 vector.
#[target_feature(enable = "avx512f")]
unsafe fn test_mm512_permutexvar_epi64() {
// Lane `i` holds `(i + 1) * 100`, so each result value identifies its source lane.
let a = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800);
// Mirrors stdarch's basic sanity check.
let idx = _mm512_set1_epi64(1);
let r = _mm512_permutexvar_epi64(idx, a);
// Every output lane selects source lane 1 (value 200).
let e = _mm512_set1_epi64(200);
assert_eq_m512i(r, e);
// This must permute across the full 512-bit register, not within 128-bit lanes.
let idx = _mm512_setr_epi64(7, 0, 5, 2, 6, 1, 4, 3);
let r = _mm512_permutexvar_epi64(idx, a);
let e = _mm512_setr_epi64(800, 100, 600, 300, 700, 200, 500, 400);
assert_eq_m512i(r, e);
// Only the low 3 bits of each 64-bit index are used.
// 8 & 7 == 0, 15 & 7 == 7, (-1) & 7 == 7, i64::MIN & 7 == 0, so the expected
// lanes are 0, 7, 7, 0 followed by the identity mapping 0..=3.
let idx = _mm512_setr_epi64(8, 15, -1, i64::MIN, 0, 1, 2, 3);
let r = _mm512_permutexvar_epi64(idx, a);
let e = _mm512_setr_epi64(100, 800, 800, 100, 100, 200, 300, 400);
assert_eq_m512i(r, e);
}
test_mm512_permutexvar_epi64();
#[target_feature(enable = "avx512bw")]
unsafe fn test_mm512_shuffle_epi8() {
#[rustfmt::skip]