From 1842e36d00820538e5c446dc8f2a3568bd6aed6e Mon Sep 17 00:00:00 2001 From: crypto-universe Date: Thu, 9 Nov 2017 14:15:07 +0100 Subject: [PATCH] [x86][sse4.1] Add phminposuw & pmul* instructions pmulld is implemented via multiplication. --- library/stdarch/src/x86/sse41.rs | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/library/stdarch/src/x86/sse41.rs b/library/stdarch/src/x86/sse41.rs index 8c0099711946..aabb8fdb79e2 100644 --- a/library/stdarch/src/x86/sse41.rs +++ b/library/stdarch/src/x86/sse41.rs @@ -580,6 +580,39 @@ macro_rules! call { constify_imm4!(rounding, call) } +/// Find the minimal u16 element in the vector. +/// Place it in the first element of the resulting vector and its index +/// in the second element (formally bits [16..18] inclusive). +/// All other elements are set to zero. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(phminposuw))] +pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 { + phminposuw(a) +} + +/// Multiply the low 32-bit integers from each packed 64-bit element +/// in a and b, and return the signed 64-bit results. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmuldq))] +pub unsafe fn _mm_mul_epi32(a: i32x4, b: i32x4) -> i64x2 { + pmuldq(a, b) +} + +/// Multiply the packed 32-bit integers in a and b, producing intermediate +/// 64-bit integers, and return the lowest 32 bits, whatever they might be, +/// reinterpreted as a signed integer. +/// While pmulld i32x4::splat(2), i32x4::splat(2) returns the obvious +/// i32x4::splat(4), pmulld i32x4::splat(i32::MAX), i32x4::splat(2) +/// would return a negative number. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmulld))] +pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { + a * b +} + #[allow(improper_ctypes)] extern "C" { @@ -627,6 +660,10 @@ macro_rules! 
call { fn roundsd(a: f64x2, b: f64x2, rounding: i32) -> f64x2; #[link_name = "llvm.x86.sse41.round.ss"] fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.sse41.phminposuw"] + fn phminposuw(a: u16x8) -> u16x8; + #[link_name = "llvm.x86.sse41.pmuldq"] + fn pmuldq(a: i32x4, b: i32x4) -> i64x2; } #[cfg(test)] @@ -1109,4 +1146,46 @@ unsafe fn _mm_round_ss() { let e = f32x4::new(-2.0, 3.5, 7.5, 15.5); assert_eq!(r, e); } + + #[simd_test = "sse4.1"] + unsafe fn _mm_minpos_epu16_1() { + let a = u16x8::new(23, 18, 44, 97, 50, 13, 67, 66); + let r = sse41::_mm_minpos_epu16(a); + let e = u16x8::new(13, 5, 0, 0, 0, 0, 0, 0); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_minpos_epu16_2() { + let a = u16x8::new(0, 18, 44, 97, 50, 13, 67, 66); + let r = sse41::_mm_minpos_epu16(a); + let e = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_mul_epi32() { + let a = + i32x4::new(15, 2 /* ignored */, 1234567, 4 /* ignored */); + let b = i32x4::new( + -20, + -256, /* ignored */ + 666666, + 666666, /* ignored */ + ); + let r = sse41::_mm_mul_epi32(a, b); + let e = i64x2::new(-300, 823043843622); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_mullo_epi32() { + let a = i32x4::new(15, -2, 1234567, 99999); + let b = i32x4::new(-20, -256, 666666, -99999); + let r = sse41::_mm_mullo_epi32(a, b); + // Attention, most significant bit in r[2] is treated as a sign bit! + // 1234567 * 666666 = 823043843622, truncated to i32: -1589877210 + let e = i32x4::new(-300, 512, -1589877210, -1409865409); + assert_eq!(r, e); + } }