Use intrinsics for sse41

2026-04-27 18:57:42 +03:00 · 2026-02-19 07:05:07 -05:00
parent a5ba7941f9
commit 6fa33bea62
1 changed files with 19 additions and 4 deletions
@@ -418,7 +418,7 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() }
 }

-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
@@ -427,7 +427,24 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(packusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min)
+            .as_m128i()
+            .as_i16x8();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min)
+            .as_m128i()
+            .as_i16x8();
+
+        // Shuffle the low u16 of each i32 from two concatenated vectors into
+        // the low bits of the result register.
+        const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14];
+        let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m128i()
+    }
 }

 /// Compares packed 64-bit integers in `a` and `b` for equality
@@ -1166,8 +1183,6 @@ pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
 unsafe extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
-    #[link_name = "llvm.x86.sse41.packusdw"]
-    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]