From c5b220e1de013b5cdac52d9820f5f55b40401ab7 Mon Sep 17 00:00:00 2001 From: Sayantan Chakraborty <142906350+sayantn@users.noreply.github.com> Date: Fri, 2 Jan 2026 19:35:29 +0000 Subject: [PATCH] Merge pull request #1985 from usamoi/vpmaddwd Use LLVM intrinsics for `madd` intrinsics (cherry picked from commit 85f3ba3dd1d4d510c9ce1c6f7aed58fcb53a448c) --- .../stdarch/crates/core_arch/src/x86/avx2.rs | 21 ++++++++++---- .../crates/core_arch/src/x86/avx512bw.rs | 29 ++++++++++--------- .../stdarch/crates/core_arch/src/x86/sse2.rs | 21 ++++++++++---- 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 8be302cabc77..e22e39fca882 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -1754,12 +1754,19 @@ pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m25 #[cfg_attr(test, assert_instr(vpmaddwd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { - let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16())); - let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]); - let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]); - simd_add(even, odd).as_m256i() - } + // It's a trick used in the Adler-32 algorithm to perform a widening addition. + // + // ```rust + // #[target_feature(enable = "avx2")] + // unsafe fn widening_add(mad: __m256i) -> __m256i { + // _mm256_madd_epi16(mad, _mm256_set1_epi16(1)) + // } + // ``` + // + // If we implement this using generic vector intrinsics, the optimizer + // will eliminate this pattern, and `vpmaddwd` will no longer be emitted. + // For this reason, we use x86 intrinsics. + unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } } /// Vertically multiplies each unsigned 8-bit integer from `a` with the @@ -3701,6 +3708,8 @@ pub fn _mm256_extract_epi16(a: __m256i) -> i32 { fn phaddsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.phsub.sw"] fn phsubsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmadd.wd"] + fn pmaddwd(a: i16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16; #[link_name = "llvm.x86.avx2.mpsadbw"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 0e2dd3ad4068..d6cc0921750b 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -5847,20 +5847,19 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmaddwd))] pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { - let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32())); - let even: i32x16 = simd_shuffle!( - r, - r, - [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] - ); - let odd: i32x16 = simd_shuffle!( - r, - r, - [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31] - ); - simd_add(even, odd).as_m512i() - } + // It's a trick used in the Adler-32 algorithm to perform a widening addition. 
+ // + // ```rust + // #[target_feature(enable = "avx512bw")] + // unsafe fn widening_add(mad: __m512i) -> __m512i { + // _mm512_madd_epi16(mad, _mm512_set1_epi16(1)) + // } + // ``` + // + // If we implement this using generic vector intrinsics, the optimizer + // will eliminate this pattern, and `vpmaddwd` will no longer be emitted. + // For this reason, we use x86 intrinsics. + unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11687,6 +11686,8 @@ pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; + #[link_name = "llvm.x86.avx512.pmaddw.d.512"] + fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32; diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 11335856fb22..c5b051bfb69d 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -201,12 +201,19 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmaddwd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { - let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8())); - let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]); - let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]); - simd_add(even, odd).as_m128i() - } + // It's a trick used in the Adler-32 algorithm to perform a widening addition. + // + // ```rust + // #[target_feature(enable = "sse2")] + // unsafe fn widening_add(mad: __m128i) -> __m128i { + // _mm_madd_epi16(mad, _mm_set1_epi16(1)) + // } + // ``` + // + // If we implement this using generic vector intrinsics, the optimizer + // will eliminate this pattern, and `pmaddwd` will no longer be emitted. + // For this reason, we use x86 intrinsics. + unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -3054,6 +3061,8 @@ pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); + #[link_name = "llvm.x86.sse2.pmadd.wd"] + fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"]
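
For reference, here is a minimal, self-contained sketch (not part of the patch above) of the widening-add trick that the new comments cite. The function name `widening_add`, the input values, and the runtime feature check are illustrative additions; only `_mm_madd_epi16` and `_mm_set1_epi16` appear in the patch itself. Because multiplying by an all-ones vector is a no-op, the multiply-add collapses into pairwise widening additions of adjacent 16-bit lanes. The sketch assumes an x86_64 target:

```rust
use std::arch::x86_64::*;

/// Pairwise widening addition of adjacent i16 lanes, as used in Adler-32.
/// `_mm_madd_epi16(v, splat(1))` multiplies every lane by 1 and then adds
/// adjacent pairs into i32 lanes, i.e. a single `pmaddwd`.
#[target_feature(enable = "sse2")]
unsafe fn widening_add(mad: __m128i) -> __m128i {
    _mm_madd_epi16(mad, _mm_set1_epi16(1))
}

fn main() {
    if is_x86_feature_detected!("sse2") {
        unsafe {
            // Eight i16 lanes: [1, 2, 3, 4, 5, 6, 7, 8].
            let v = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
            let r = widening_add(v);
            // Adjacent pairs summed into four i32 lanes: [3, 7, 11, 15].
            let mut out = [0i32; 4];
            _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
            assert_eq!(out, [1 + 2, 3 + 4, 5 + 6, 7 + 8]);
        }
    }
}
```

With the old generic-vector implementation, LLVM could recognize the multiply-by-one and rewrite this pattern into plain widening adds, so `pmaddwd` was no longer guaranteed to be emitted; calling the LLVM intrinsic directly, as this patch does, keeps the instruction selection stable.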