sse4.1 instructions (#98)

* sse4.1: _mm_blendv_ps and _mm_blendv_pd * sse4.1: _mm_blend_ps and _mm_blend_pd - HACK warning: messing with the constify macros - Selecting only one buffer gets optimized away and tests need to take this into account * sse4.1: _mm_blend_epi16 * sse4.1: _mm_extract_ps * sse4.1: _mm_extract_epi8 * sse4.1: _mm_extract_epi32 * sse4.1: _mm_extract_epi64 * sse4.1: _mm_insert_ps * sse4.1: _mm_insert_epi8 * sse4.1: _mm_insert_epi32 and _mm_insert_epi64 * Formatting * sse4.1: _mm_max_epi8, _mm_max_epu16, _mm_max_epi32 and _mm_max_epu32 * Fix wrong compiler flag - avx -> sse4.1 * Fix intrinsics that only work with x86-64 * sse4.1: use appropriate types * Revert '_mm_extract_ps' to return i32 * sse4.1: Use the v128 types for consistency * Try fix for windows * Try "vectorcall" calling convention * Revert "Try "vectorcall" calling convention" This reverts commit 12936e9976bc6b0e4e538d82f55f0ee2d87a7f25. * Revert "Try fix for windows" This reverts commit 9c473808d334acedd46060b32ceea116662bf6a3. * Change tests for windows * Remove useless Windows test
2026-05-28 20:16:58 +03:00 · 2017-10-18 16:34:51 +01:00
parent acf919f960
commit 02c89b24ba
2 changed files with 398 additions and 7 deletions
@@ -301,3 +301,41 @@ macro_rules! constify_imm6 {
        }
    }
 }
+
+// Dispatch a runtime value to `$expand!` with a literal argument.
+//
+// Only the four low bits of `$imm8` are looked at (`$imm8 & 0b1111`), so the
+// sixteen arms below cover every value the match can see. Each arm invokes
+// `$expand!` with the matching literal `0`..`15`, which means the callee macro
+// always receives a literal rather than a runtime expression.
+macro_rules! constify_imm4 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm8 & 0b1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            // After masking, 15 is the only value that can reach the wildcard.
+            _ => $expand!(15),
+        }
+    }
+}
+
+// Dispatch a runtime value to `$expand!` with a literal argument.
+//
+// Only the two low bits of `$imm8` are looked at (`$imm8 & 0b11`); each arm
+// invokes `$expand!` with the matching literal `0`..`3`.
+macro_rules! constify_imm2 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm8 & 0b11 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            // After masking, 3 is the only value that can reach the wildcard.
+            _ => $expand!(3),
+        }
+    }
+}
+
+
@@ -1,20 +1,191 @@
+
+use std::mem;
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;

 use v128::*;
-use x86::__m128i;

+/// Blend packed 8-bit integers from `a` and `b` using `mask`.
+///
+/// Forwards directly to the `pblendvb` intrinsic declared in the `extern` block.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
-pub unsafe fn _mm_blendv_epi8(
-    a: __m128i,
-    b: __m128i,
-    mask: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
    pblendvb(a, b, mask)
 }

+/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
+///
+/// Bit `i` of `imm8` picks the source of element `i`: a set bit takes the
+/// element from `b`, a clear bit takes it from `a`.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))]
+pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
+    // `call!` is handed to `constify_imm8!` (defined outside this hunk), which
+    // invokes it with the immediate to pass on to `pblendw`.
+    macro_rules! call {
+        ($imm8:expr) => { pblendw(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
+///
+/// A lane whose `mask` element has all bits set is taken from `b`; a lane whose
+/// `mask` element is all zeros is taken from `a`.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendvpd))]
+pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
+    blendvpd(a, b, mask)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
+///
+/// A lane whose `mask` element has all bits set is taken from `b`; a lane whose
+/// `mask` element is all zeros is taken from `a`.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendvps))]
+pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
+    blendvps(a, b, mask)
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2`.
+///
+/// Bit `i` of `imm2` selects element `i` from `b` when set and from `a` when
+/// clear. Only the two low bits of `imm2` are used.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))]
+pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
+    // `constify_imm2!` invokes `call!` with a literal `0`..`3`.
+    macro_rules! call {
+        ($imm2:expr) => { blendpd(a, b, $imm2) }
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4`.
+///
+/// Bit `i` of `imm4` selects element `i` from `b` when set and from `a` when
+/// clear. Only the four low bits of `imm4` are used.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))]
+pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
+    // `constify_imm4!` invokes `call!` with a literal `0`..`15`.
+    macro_rules! call {
+        ($imm4:expr) => { blendps(a, b, $imm4) }
+    }
+    constify_imm4!(imm4, call)
+}
+
+/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
+///
+/// The element is returned as its raw bit pattern in an `i32` (through
+/// `mem::transmute`), not as an `f32`. Only the two low bits of `imm8` are used
+/// as the lane index.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+// TODO: Add test for Windows
+#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
+    mem::transmute(a.extract(imm8 as u32 & 0b11))
+}
+
+/// Extract an 8-bit integer from `a`, selected with `imm8`.
+///
+/// Only the four low bits of `imm8` are used as the lane index.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
+    a.extract((imm8 & 0b1111) as u32)
+}
+
+/// Extract a 32-bit integer from `a`, selected with `imm8`.
+///
+/// Only the two low bits of `imm8` are used as the lane index.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+// TODO: Add test for Windows
+#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))]
+pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
+    a.extract((imm8 & 0b11) as u32)
+}
+
+/// Extract a 64-bit integer from `a`, selected with `imm8`.
+///
+/// Only the lowest bit of `imm8` is used as the lane index. This function is
+/// only compiled for `x86_64` targets.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+// TODO: Add test for Windows
+#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))]
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
+    a.extract((imm8 & 0b1) as u32)
+}
+
+/// Select a single value in `b` and store it at some position in `a`,
+/// then zero elements according to `imm8`.
+///
+/// `imm8` specifies which bits from operand `b` will be copied, which bits in
+/// the result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `b`:
+///     - `00`: Selects bits `[31:0]` from operand `b`.
+///     - `01`: Selects bits `[63:32]` from operand `b`.
+///     - `10`: Selects bits `[95:64]` from operand `b`.
+///     - `11`: Selects bits `[127:96]` from operand `b`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `b` are copied:
+///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
+///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
+///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
+///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared. Elements that are neither written nor cleared keep
+/// their value from `a`.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))]
+pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+    macro_rules! call {
+        ($imm8:expr) => { insertps(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`.
+///
+/// Only the four low bits of `imm8` are used as the lane index.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrb, imm8=0))]
+pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
+    a.replace((imm8 & 0b1111) as u32, i)
+}
+
+/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`.
+///
+/// Only the two low bits of `imm8` are used as the lane index.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrd, imm8=0))]
+pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
+    a.replace((imm8 & 0b11) as u32, i)
+}
+
+/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`.
+///
+/// Only the lowest bit of `imm8` is used as the lane index. This function is
+/// only compiled for `x86_64` targets.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrq, imm8=0))]
+pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
+    a.replace((imm8 & 0b1) as u32, i)
+}
+
+/// Compare packed 8-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxsb))]
+pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    pmaxsb(a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxuw))]
+pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    pmaxuw(a, b)
+}
+
+/// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxsd))]
+pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
+    pmaxsd(a, b)
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxud))]
+pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
+    pmaxud(a, b)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -52,7 +223,27 @@ macro_rules! call {
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse41.pblendvb"]
-    fn pblendvb(a: __m128i, b: __m128i, mask: __m128i) -> __m128i;
+    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.blendvpd"]
+    fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2;
+    #[link_name = "llvm.x86.sse41.blendvps"]
+    fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse41.blendpd"]
+    fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2;
+    #[link_name = "llvm.x86.sse41.blendps"]
+    fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4;
+    #[link_name = "llvm.x86.sse41.pblendw"]
+    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
+    #[link_name = "llvm.x86.sse41.insertps"]
+    fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
+    #[link_name = "llvm.x86.sse41.pmaxsb"]
+    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.pmaxuw"]
+    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.pmaxsd"]
+    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse41.pmaxud"]
+    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
    #[link_name = "llvm.x86.sse41.dpps"]
@@ -61,6 +252,8 @@ macro_rules! call {

 #[cfg(test)]
 mod tests {
+    use std::mem;
+
    use stdsimd_test::simd_test;

    use v128::*;
@@ -79,6 +272,166 @@ unsafe fn _mm_blendv_epi8() {
        assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
    }

+    // The mask is built from integer lanes and reinterpreted as `f64x2`:
+    // lane 0 is all zeros and keeps `a`, lane 1 is all ones (-1) and takes `b`.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blendv_pd() {
+        let a = f64x2::splat(0.0);
+        let b = f64x2::splat(1.0);
+        let mask = mem::transmute(i64x2::new(0, -1));
+        let r = sse41::_mm_blendv_pd(a, b, mask);
+        let e = f64x2::new(0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    // The mask is built from integer lanes and reinterpreted as `f32x4`:
+    // all-zero lanes (0 and 2) keep `a`, all-ones lanes (1 and 3) take `b`.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blendv_ps() {
+        let a = f32x4::splat(0.0);
+        let b = f32x4::splat(1.0);
+        let mask = mem::transmute(i32x4::new(0,-1, 0, -1));
+        let r = sse41::_mm_blendv_ps(a, b, mask);
+        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    // 0b10: bit 0 is clear so lane 0 comes from `a`; bit 1 is set so lane 1
+    // comes from `b`.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_pd() {
+        let a = f64x2::splat(0.0);
+        let b = f64x2::splat(1.0);
+        let r = sse41::_mm_blend_pd(a, b, 0b10);
+        let e = f64x2::new(0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    // 0b1010: bits 1 and 3 are set, so lanes 1 and 3 come from `b` and lanes
+    // 0 and 2 come from `a`.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_ps() {
+        let a = f32x4::splat(0.0);
+        let b = f32x4::splat(1.0);
+        let r = sse41::_mm_blend_ps(a, b, 0b1010);
+        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    // 0b1010_1100 read from bit 0 upwards is 0,0,1,1,0,1,0,1: each set bit
+    // takes the lane from `b` (1), each clear bit takes it from `a` (0).
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_epi16() {
+        let a = i16x8::splat(0);
+        let b = i16x8::splat(1);
+        let r = sse41::_mm_blend_epi16(a, b, 0b1010_1100);
+        let e = i16x8::new(0, 0, 1, 1, 0, 1, 0, 1);
+        assert_eq!(r, e);
+    }
+
+    // `_mm_extract_ps` returns the lane's bits as `i32`, so the result is
+    // transmuted back to `f32` before comparing.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_ps() {
+        let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
+        assert_eq!(r, 1.0);
+        // Index 5 is out of range; only its two low bits are used (5 & 0b11 == 1).
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
+        assert_eq!(r, 1.0);
+    }
+
+    // Each lane holds its own index, so the extracted value equals the lane used.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi8() {
+        let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = sse41::_mm_extract_epi8(a, 1);
+        assert_eq!(r, 1);
+        // Index 17 is out of range; only its four low bits are used (17 & 0b1111 == 1).
+        let r = sse41::_mm_extract_epi8(a, 17);
+        assert_eq!(r, 1);
+    }
+
+    // Each lane holds its own index, so the extracted value equals the lane used.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi32() {
+        let a = i32x4::new(0, 1, 2, 3);
+        let r = sse41::_mm_extract_epi32(a, 1);
+        assert_eq!(r, 1);
+        // Index 5 is out of range; only its two low bits are used (5 & 0b11 == 1).
+        let r = sse41::_mm_extract_epi32(a, 5);
+        assert_eq!(r, 1);
+    }
+
+    // Only built on x86_64, matching the `cfg` on the intrinsic itself.
+    #[cfg(target_arch = "x86_64")]
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi64() {
+        let a = i64x2::new(0, 1);
+        let r = sse41::_mm_extract_epi64(a, 1);
+        assert_eq!(r, 1);
+        // Index 3 is out of range; only its lowest bit is used (3 & 0b1 == 1).
+        let r = sse41::_mm_extract_epi64(a, 3);
+        assert_eq!(r, 1);
+    }
+
+    // imm8 = 0b11_00_1100:
+    //   bits [7:6] = 11   -> take lane 3 of `b` (4.0)
+    //   bits [5:4] = 00   -> store it in lane 0 of the result
+    //   bits [3:0] = 1100 -> zero lanes 2 and 3
+    // Lane 1 is untouched and keeps its value from `a` (1.0).
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_ps() {
+        let a = f32x4::splat(1.0);
+        let b = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let r = sse41::_mm_insert_ps(a, b, 0b11_00_1100);
+        let e = f32x4::new(4.0, 1.0, 0.0, 0.0);
+        assert_eq!(r, e);
+    }
+
+    // The expected vector is all zeros with 32 written to lane 1.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi8() {
+        let a = i8x16::splat(0);
+        let e = i8x16::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi8(a, 32, 1);
+        assert_eq!(r, e);
+        // Index 17 is out of range; only its four low bits are used (17 & 0b1111 == 1).
+        let r = sse41::_mm_insert_epi8(a, 32, 17);
+        assert_eq!(r, e);
+    }
+
+    // The expected vector is all zeros with 32 written to lane 1.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi32() {
+        let a = i32x4::splat(0);
+        let e = i32x4::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi32(a, 32, 1);
+        assert_eq!(r, e);
+        // Index 5 is out of range; only its two low bits are used (5 & 0b11 == 1).
+        let r = sse41::_mm_insert_epi32(a, 32, 5);
+        assert_eq!(r, e);
+    }
+
+    // Only built on x86_64, matching the `cfg` on the intrinsic itself.
+    // The expected vector is all zeros with 32 written to lane 1.
+    #[cfg(target_arch = "x86_64")]
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi64() {
+        let a = i64x2::splat(0);
+        let e = i64x2::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi64(a, 32, 1);
+        assert_eq!(r, e);
+        // Index 3 is out of range; only its lowest bit is used (3 & 0b1 == 1).
+        let r = sse41::_mm_insert_epi64(a, 32, 3);
+        assert_eq!(r, e);
+    }
+
+    // The inputs are interleaved so the larger value alternates between `b`
+    // and `a`; the expected result is the even numbers 2..=32.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_max_epi8() {
+        let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32);
+        let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+        let r = sse41::_mm_max_epi8(a, b);
+        let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32);
+        assert_eq!(r, e);
+    }
+
+    // The inputs are interleaved so the larger value alternates between `b`
+    // and `a`; the expected result is the even numbers 2..=16.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_max_epu16() {
+        let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = sse41::_mm_max_epu16(a, b);
+        let e = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+        assert_eq!(r, e);
+    }
+
+    // The inputs are interleaved so the larger value alternates between `b`
+    // and `a`; the expected result is the even numbers 2..=8.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_max_epi32() {
+        let a = i32x4::new(1, 4, 5, 8);
+        let b = i32x4::new(2, 3, 6, 7);
+        let r = sse41::_mm_max_epi32(a, b);
+        let e = i32x4::new(2, 4, 6, 8);
+        assert_eq!(r, e);
+    }
+
+    // The inputs are interleaved so the larger value alternates between `b`
+    // and `a`; the expected result is the even numbers 2..=8.
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_max_epu32() {
+        let a = u32x4::new(1, 4, 5, 8);
+        let b = u32x4::new(2, 3, 6, 7);
+        let r = sse41::_mm_max_epu32(a, b);
+        let e = u32x4::new(2, 4, 6, 8);
+        assert_eq!(r, e);
+    }
+
    #[simd_test = "sse4.1"]
    unsafe fn _mm_dp_pd() {
        let a = f64x2::new(2.0, 3.0);