Use intrinsics for sse41

This commit is contained in:
okaneco
2026-02-19 07:05:07 -05:00
parent a5ba7941f9
commit 6fa33bea62
@@ -418,7 +418,7 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() }
}
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
@@ -427,7 +427,24 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
unsafe {
let max = simd_splat(i32::from(u16::MAX));
let min = simd_splat(i32::from(u16::MIN));
let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min)
.as_m128i()
.as_i16x8();
let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min)
.as_m128i()
.as_i16x8();
// Shuffle the low u16 of each i32 from two concatenated vectors into
// the low bits of the result register.
const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14];
let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS);
result.as_m128i()
}
}
/// Compares packed 64-bit integers in `a` and `b` for equality
@@ -1166,8 +1183,6 @@ pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
unsafe extern "C" {
#[link_name = "llvm.x86.sse41.insertps"]
fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
#[link_name = "llvm.x86.sse41.packusdw"]
fn packusdw(a: i32x4, b: i32x4) -> u16x8;
#[link_name = "llvm.x86.sse41.dppd"]
fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
#[link_name = "llvm.x86.sse41.dpps"]