From 6fa33bea62865ebbfb58b88f689e4210b0769c1f Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Thu, 19 Feb 2026 07:05:07 -0500 Subject: [PATCH] Use intrinsics for `sse41` --- .../stdarch/crates/core_arch/src/x86/sse41.rs | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs index 7ad4306f36f2..8036f24e24d3 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse41.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs @@ -418,7 +418,7 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() } } -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) @@ -427,7 +427,24 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(packusdw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } + unsafe { + let max = simd_splat(i32::from(u16::MAX)); + let min = simd_splat(i32::from(u16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min) + .as_m128i() + .as_i16x8(); + let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min) + .as_m128i() + .as_i16x8(); + + // Shuffle the low u16 of each i32 from two concatenated vectors into + // the low bits of the result register. + const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14]; + let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m128i() + } } /// Compares packed 64-bit integers in `a` and `b` for equality @@ -1166,8 +1183,6 @@ pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { unsafe extern "C" { #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; - #[link_name = "llvm.x86.sse41.packusdw"] - fn packusdw(a: i32x4, b: i32x4) -> u16x8; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; #[link_name = "llvm.x86.sse41.dpps"]