From 40a0b1cc924ea541452a263ca47148f1047efb7f Mon Sep 17 00:00:00 2001 From: Tony Sifkarovski Date: Sun, 26 Nov 2017 11:40:26 -0500 Subject: [PATCH] [avx2] add shuffle, insert/extract i128, permute* (#210) * [x86][avx2] add _mm256_shuffle{hi,lo}_epi16 * [x86][avx2] add _mm256_{insert,extract}i128_si256 * [x86][avx2] add remaining permute intrinsics --- library/stdarch/coresimd/src/x86/i586/avx2.rs | 304 +++++++++++++++++- 1 file changed, 295 insertions(+), 9 deletions(-) diff --git a/library/stdarch/coresimd/src/x86/i586/avx2.rs b/library/stdarch/coresimd/src/x86/i586/avx2.rs index 7bf376b95f1e..81f836f15466 100644 --- a/library/stdarch/coresimd/src/x86/i586/avx2.rs +++ b/library/stdarch/coresimd/src/x86/i586/avx2.rs @@ -24,7 +24,7 @@ use v256::*; use v128::*; -use x86::__m256i; +use x86::{__m128i, __m256i}; #[cfg(test)] use stdsimd_test::assert_instr; @@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 { simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3])) } -// TODO _m128i _mm256_extracti128_si256 +/// Extract 128 bits (of integer data) from `a` selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] +pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { + use x86::i586::avx::_mm256_undefined_si256; + let imm8 = (imm8 & 0xFF) as u8; + let b = i64x4::from(_mm256_undefined_si256()); + let dst: i64x2 = match imm8 & 0b01 { + 0 => simd_shuffle2(i64x4::from(a), b, [0, 1]), + _ => simd_shuffle2(i64x4::from(a), b, [2, 3]), + }; + __m128i::from(dst) +} /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. #[inline(always)] @@ -1191,7 +1204,23 @@ macro_rules! call { constify_imm8!(scale, call) } -// TODO _mm256_inserti128_si256 +/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the +/// location specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_inserti128_si256( + a: __m256i, b: __m128i, imm8: i32 +) -> __m256i { + use x86::i586::avx::_mm256_castsi128_si256; + let imm8 = (imm8 & 0b01) as u8; + let b = i64x4::from(_mm256_castsi128_si256(b)); + let dst: i64x4 = match imm8 & 0b01 { + 0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]), + _ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]), + }; + __m256i::from(dst) +} /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs @@ -1616,9 +1645,80 @@ macro_rules! permute1 { } } -// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) -// TODO _mm256_permute4x64_pd (__m256d a, const int imm8) -// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx) +/// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))] +pub unsafe fn _mm256_permute2x128_si256( + a: __m256i, b: __m256i, imm8: i32 +) -> __m256i { + macro_rules! call { + ($imm8:expr) => { + __m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8)) + } + } + constify_imm8!(imm8, call) +} + +/// Shuffle 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] +pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 { + use x86::i586::avx::_mm256_undefined_pd; + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle_done { + ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { + simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67]) + } + } + macro_rules! shuffle_x67 { + ($x01:expr, $x23:expr, $x45:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle_done!($x01, $x23, $x45, 0), + 0b01 => shuffle_done!($x01, $x23, $x45, 1), + 0b10 => shuffle_done!($x01, $x23, $x45, 2), + _ => shuffle_done!($x01, $x23, $x45, 3), + } + } + } + macro_rules! shuffle_x45 { + ($x01:expr, $x23:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle_x67!($x01, $x23, 0), + 0b01 => shuffle_x67!($x01, $x23, 1), + 0b10 => shuffle_x67!($x01, $x23, 2), + _ => shuffle_x67!($x01, $x23, 3), + } + } + } + macro_rules! shuffle_x23 { + ($x01:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle_x45!($x01, 0), + 0b01 => shuffle_x45!($x01, 1), + 0b10 => shuffle_x45!($x01, 2), + _ => shuffle_x45!($x01, 3), + } + } + } + match imm8 & 0b11 { + 0b00 => shuffle_x23!(0), + 0b01 => shuffle_x23!(1), + 0b10 => shuffle_x23!(2), + _ => shuffle_x23!(3), + } +} + +/// Shuffle eight 32-bit foating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vpermps))] +pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 { + permps(a, idx) +} /// Compute the absolute differences of packed unsigned 8-bit integers in `a` /// and `b`, then horizontally sum each consecutive 8 differences to @@ -1760,8 +1860,115 @@ macro_rules! shuffle_x23 { } } -// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8) -// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8) +/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied +/// to the output. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] +pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle_done { + ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { + #[cfg_attr(rustfmt, rustfmt_skip)] + simd_shuffle16(a, a, [ + 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67, + 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67 + ]); + } + } + macro_rules! shuffle_x67 { + ($x01:expr, $x23:expr, $x45:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle_done!($x01, $x23, $x45, 0), + 0b01 => shuffle_done!($x01, $x23, $x45, 1), + 0b10 => shuffle_done!($x01, $x23, $x45, 2), + _ => shuffle_done!($x01, $x23, $x45, 3), + } + } + } + macro_rules! shuffle_x45 { + ($x01:expr, $x23:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle_x67!($x01, $x23, 0), + 0b01 => shuffle_x67!($x01, $x23, 1), + 0b10 => shuffle_x67!($x01, $x23, 2), + _ => shuffle_x67!($x01, $x23, 3), + } + } + } + macro_rules! shuffle_x23 { + ($x01:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle_x45!($x01, 0), + 0b01 => shuffle_x45!($x01, 1), + 0b10 => shuffle_x45!($x01, 2), + _ => shuffle_x45!($x01, 3), + } + } + } + match imm8 & 0b11 { + 0b00 => shuffle_x23!(0), + 0b01 => shuffle_x23!(1), + 0b10 => shuffle_x23!(2), + _ => shuffle_x23!(3), + } +} + +/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using +/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied +/// to the output. +#[inline(always)] +#[target_feature = "+avx2"] +#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] +pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle_done { + ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { + #[cfg_attr(rustfmt, rustfmt_skip)] + simd_shuffle16(a, a, [ + 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7, + 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15, + ]); + } + } + macro_rules! shuffle_x67 { + ($x01:expr, $x23:expr, $x45:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle_done!($x01, $x23, $x45, 0), + 0b01 => shuffle_done!($x01, $x23, $x45, 1), + 0b10 => shuffle_done!($x01, $x23, $x45, 2), + _ => shuffle_done!($x01, $x23, $x45, 3), + } + } + } + macro_rules! shuffle_x45 { + ($x01:expr, $x23:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle_x67!($x01, $x23, 0), + 0b01 => shuffle_x67!($x01, $x23, 1), + 0b10 => shuffle_x67!($x01, $x23, 2), + _ => shuffle_x67!($x01, $x23, 3), + } + } + } + macro_rules! shuffle_x23 { + ($x01:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle_x45!($x01, 0), + 0b01 => shuffle_x45!($x01, 1), + 0b10 => shuffle_x45!($x01, 2), + _ => shuffle_x45!($x01, 3), + } + } + } + match imm8 & 0b11 { + 0b00 => shuffle_x23!(0), + 0b01 => shuffle_x23!(1), + 0b10 => shuffle_x23!(2), + _ => shuffle_x23!(3), + } +} /// Negate packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and return the results. @@ -2626,6 +2833,10 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { fn pshufb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.permd"] fn permd(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.permps"] + fn permps(a: f32x8, b: i32x8) -> f32x8; + #[link_name = "llvm.x86.avx2.vperm2i128"] + fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.d"] fn pgatherdd( src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8 @@ -2700,7 +2911,7 @@ mod tests { use v256::*; use v128::*; use x86::i586::avx2; - use x86::__m256i; + use x86::{__m128i, __m256i}; use std; #[simd_test = "avx2"] @@ -3306,6 +3517,14 @@ unsafe fn _mm256_cvtepu8_epi64() { assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a)); } + #[simd_test = "avx2"] + unsafe fn _mm256_extracti128_si256() { + let a = __m256i::from(i64x4::new(1, 2, 3, 4)); + let r = avx2::_mm256_extracti128_si256(a, 0b01); + let e = __m128i::from(i64x2::new(3, 4)); + assert_eq!(r, e); + } + #[simd_test = "avx2"] unsafe fn _mm256_hadd_epi16() { let a = i16x16::splat(2); @@ -3370,6 +3589,15 @@ unsafe fn _mm256_madd_epi16() { assert_eq!(r, e); } + #[simd_test = "avx2"] + unsafe fn _mm256_inserti128_si256() { + let a = __m256i::from(i64x4::new(1, 2, 3, 4)); + let b = __m128i::from(i64x2::new(7, 8)); + let r = avx2::_mm256_inserti128_si256(a, b, 0b01); + let e = i64x4::new(1, 2, 7, 8); + assert_eq!(r, __m256i::from(e)); + } + #[simd_test = "avx2"] unsafe fn _mm256_maddubs_epi16() { let a = u8x32::splat(2); @@ -3704,6 +3932,38 @@ unsafe fn _mm256_sad_epu8() { assert_eq!(r, e); } + #[simd_test = "avx2"] + unsafe fn _mm256_shufflehi_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = i16x16::new( + 0, 1, 2, 3, 11, 22, 33, 44, + 4, 5, 6, 7, 55, 66, 77, 88, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i16x16::new( + 0, 1, 2, 3, 44, 22, 22, 11, + 4, 5, 6, 7, 88, 66, 66, 55, + ); + let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11); + assert_eq!(r, e); + } + + #[simd_test = "avx2"] + unsafe fn _mm256_shufflelo_epi16() { + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = i16x16::new( + 11, 22, 33, 44, 0, 1, 2, 3, + 55, 66, 77, 88, 4, 5, 6, 7, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = i16x16::new( + 44, 22, 22, 11, 0, 1, 2, 3, + 88, 66, 66, 55, 4, 5, 6, 7, + ); + let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11); + assert_eq!(r, e); + } + #[simd_test = "avx2"] unsafe fn _mm256_sign_epi16() { let a = i16x16::splat(2); @@ -4119,6 +4379,32 @@ unsafe fn _mm256_permute4x64_epi64() { assert_eq!(r, expected); } + #[simd_test = "avx2"] + unsafe fn _mm256_permute2x128_si256() { + let a = __m256i::from(i64x4::new(100, 200, 500, 600)); + let b = __m256i::from(i64x4::new(300, 400, 700, 800)); + let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11); + let e = i64x4::new(700, 800, 500, 600); + assert_eq!(i64x4::from(r), e); + } + + #[simd_test = "avx2"] + unsafe fn _mm256_permute4x64_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11); + let e = f64x4::new(4., 1., 2., 1.); + assert_eq!(r, e); + } + + #[simd_test = "avx2"] + unsafe fn _mm256_permutevar8x32_ps() { + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4); + let r = avx2::_mm256_permutevar8x32_ps(a, b); + let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.); + assert_eq!(r, e); + } + #[simd_test = "avx2"] unsafe fn _mm_i32gather_epi32() { let mut arr = [0i32; 128];