From 40a0b1cc924ea541452a263ca47148f1047efb7f Mon Sep 17 00:00:00 2001
From: Tony Sifkarovski <governortarkin@users.noreply.github.com>
Date: Sun, 26 Nov 2017 11:40:26 -0500
Subject: [PATCH] [avx2] add shuffle, insert/extract i128, permute* (#210)

* [x86][avx2] add _mm256_shuffle{hi,lo}_epi16
* [x86][avx2] add _mm256_{insert,extract}i128_si256
* [x86][avx2] add remaining permute intrinsics
---
 library/stdarch/coresimd/src/x86/i586/avx2.rs | 304 +++++++++++++++++-
 1 file changed, 295 insertions(+), 9 deletions(-)

diff --git a/library/stdarch/coresimd/src/x86/i586/avx2.rs b/library/stdarch/coresimd/src/x86/i586/avx2.rs
index 7bf376b95f1e..81f836f15466 100644
--- a/library/stdarch/coresimd/src/x86/i586/avx2.rs
+++ b/library/stdarch/coresimd/src/x86/i586/avx2.rs
@@ -24,7 +24,7 @@
 
 use v256::*;
 use v128::*;
-use x86::__m256i;
+use x86::{__m128i, __m256i};
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
     simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
 }
 
-// TODO _m128i _mm256_extracti128_si256
+/// Extract 128 bits (of integer data) from `a` selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
+pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
+    use x86::i586::avx::_mm256_undefined_si256;
+    let imm8 = (imm8 & 0xFF) as u8;
+    let b = i64x4::from(_mm256_undefined_si256());
+    let dst: i64x2 = match imm8 & 0b01 {
+        0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
+        _ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
+    };
+    __m128i::from(dst)
+}
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
@@ -1191,7 +1204,23 @@ macro_rules! call {
     constify_imm8!(scale, call)
 }
 
-// TODO _mm256_inserti128_si256
+/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
+/// location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_inserti128_si256(
+    a: __m256i, b: __m128i, imm8: i32
+) -> __m256i {
+    use x86::i586::avx::_mm256_castsi128_si256;
+    let imm8 = (imm8 & 0b01) as u8;
+    let b = i64x4::from(_mm256_castsi128_si256(b));
+    let dst: i64x4 = match imm8 & 0b01 {
+        0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
+        _ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]),
+    };
+    __m256i::from(dst)
+}
 
 /// Multiply packed signed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Horizontally add adjacent pairs
@@ -1616,9 +1645,80 @@ macro_rules! permute1 {
     }
 }
 
-// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
-// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
-// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
+/// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
+pub unsafe fn _mm256_permute2x128_si256(
+    a: __m256i, b: __m256i, imm8: i32
+) -> __m256i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            __m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8))
+        }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffle 64-bit floating-point elements in `a` across lanes using the
+/// control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
+pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 {
+    use x86::i586::avx::_mm256_undefined_pd;
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67])
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle eight 32-bit foating-point elements in `a` across lanes using
+/// the corresponding 32-bit integer index in `idx`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 {
+    permps(a, idx)
+}
 
 /// Compute the absolute differences of packed unsigned 8-bit integers in `a`
 /// and `b`, then horizontally sum each consecutive 8 differences to
@@ -1760,8 +1860,115 @@ macro_rules! shuffle_x23 {
     }
 }
 
-// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
-// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
+pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
+                8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
+pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
+                8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
 
 /// Negate packed 16-bit integers in `a` when the corresponding signed
 /// 16-bit integer in `b` is negative, and return the results.
@@ -2626,6 +2833,10 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
     fn permd(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.permps"]
+    fn permps(a: f32x8, b: i32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx2.vperm2i128"]
+    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(
         src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
@@ -2700,7 +2911,7 @@ mod tests {
     use v256::*;
     use v128::*;
     use x86::i586::avx2;
-    use x86::__m256i;
+    use x86::{__m128i, __m256i};
     use std;
 
     #[simd_test = "avx2"]
@@ -3306,6 +3517,14 @@ unsafe fn _mm256_cvtepu8_epi64() {
         assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extracti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let r = avx2::_mm256_extracti128_si256(a, 0b01);
+        let e = __m128i::from(i64x2::new(3, 4));
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_hadd_epi16() {
         let a = i16x16::splat(2);
@@ -3370,6 +3589,15 @@ unsafe fn _mm256_madd_epi16() {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_inserti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let b = __m128i::from(i64x2::new(7, 8));
+        let r = avx2::_mm256_inserti128_si256(a, b, 0b01);
+        let e = i64x4::new(1, 2, 7, 8);
+        assert_eq!(r, __m256i::from(e));
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_maddubs_epi16() {
         let a = u8x32::splat(2);
@@ -3704,6 +3932,38 @@ unsafe fn _mm256_sad_epu8() {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflehi_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            0, 1, 2, 3, 11, 22, 33, 44,
+            4, 5, 6, 7, 55, 66, 77, 88,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            0, 1, 2, 3, 44, 22, 22, 11,
+            4, 5, 6, 7, 88, 66, 66, 55,
+        );
+        let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflelo_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            11, 22, 33, 44, 0, 1, 2, 3,
+            55, 66, 77, 88, 4, 5, 6, 7,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            44, 22, 22, 11, 0, 1, 2, 3,
+            88, 66, 66, 55, 4, 5, 6, 7,
+        );
+        let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_sign_epi16() {
         let a = i16x16::splat(2);
@@ -4119,6 +4379,32 @@ unsafe fn _mm256_permute4x64_epi64() {
         assert_eq!(r, expected);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute2x128_si256() {
+        let a = __m256i::from(i64x4::new(100, 200, 500, 600));
+        let b = __m256i::from(i64x4::new(300, 400, 700, 800));
+        let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11);
+        let e = i64x4::new(700, 800, 500, 600);
+        assert_eq!(i64x4::from(r), e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute4x64_pd() {
+        let a = f64x4::new(1., 2., 3., 4.);
+        let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11);
+        let e = f64x4::new(4., 1., 2., 1.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permutevar8x32_ps() {
+        let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4);
+        let r = avx2::_mm256_permutevar8x32_ps(a, b);
+        let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm_i32gather_epi32() {
         let mut arr = [0i32; 128];