[avx2] add shuffle, insert/extract i128, permute* (#210)

* [x86][avx2] add _mm256_shuffle{hi,lo}_epi16
* [x86][avx2] add _mm256_{insert,extract}i128_si256
* [x86][avx2] add remaining permute intrinsics
This commit is contained in:
Tony Sifkarovski
2017-11-26 11:40:26 -05:00
committed by gnzlbg
parent 426621f021
commit 40a0b1cc92
+295 -9
View File
@@ -24,7 +24,7 @@
use v256::*;
use v128::*;
use x86::__m256i;
use x86::{__m128i, __m256i};
#[cfg(test)]
use stdsimd_test::assert_instr;
@@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
}
// TODO _m128i _mm256_extracti128_si256
/// Extract 128 bits (of integer data) from `a` selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
use x86::i586::avx::_mm256_undefined_si256;
let imm8 = (imm8 & 0xFF) as u8;
let b = i64x4::from(_mm256_undefined_si256());
let dst: i64x2 = match imm8 & 0b01 {
0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
_ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
};
__m128i::from(dst)
}
/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
#[inline(always)]
@@ -1191,7 +1204,23 @@ macro_rules! call {
constify_imm8!(scale, call)
}
// TODO _mm256_inserti128_si256
/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
/// location specified by `imm8`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_inserti128_si256(
a: __m256i, b: __m128i, imm8: i32
) -> __m256i {
use x86::i586::avx::_mm256_castsi128_si256;
let imm8 = (imm8 & 0b01) as u8;
let b = i64x4::from(_mm256_castsi128_si256(b));
let dst: i64x4 = match imm8 & 0b01 {
0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
_ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]),
};
__m256i::from(dst)
}
/// Multiply packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
@@ -1616,9 +1645,80 @@ macro_rules! permute1 {
}
}
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
/// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
pub unsafe fn _mm256_permute2x128_si256(
a: __m256i, b: __m256i, imm8: i32
) -> __m256i {
macro_rules! call {
($imm8:expr) => {
__m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8))
}
}
constify_imm8!(imm8, call)
}
/// Shuffle 64-bit floating-point elements in `a` across lanes using the
/// control in `imm8`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 {
use x86::i586::avx::_mm256_undefined_pd;
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67])
}
}
macro_rules! shuffle_x67 {
($x01:expr, $x23:expr, $x45:expr) => {
match (imm8 >> 6) & 0b11 {
0b00 => shuffle_done!($x01, $x23, $x45, 0),
0b01 => shuffle_done!($x01, $x23, $x45, 1),
0b10 => shuffle_done!($x01, $x23, $x45, 2),
_ => shuffle_done!($x01, $x23, $x45, 3),
}
}
}
macro_rules! shuffle_x45 {
($x01:expr, $x23:expr) => {
match (imm8 >> 4) & 0b11 {
0b00 => shuffle_x67!($x01, $x23, 0),
0b01 => shuffle_x67!($x01, $x23, 1),
0b10 => shuffle_x67!($x01, $x23, 2),
_ => shuffle_x67!($x01, $x23, 3),
}
}
}
macro_rules! shuffle_x23 {
($x01:expr) => {
match (imm8 >> 2) & 0b11 {
0b00 => shuffle_x45!($x01, 0),
0b01 => shuffle_x45!($x01, 1),
0b10 => shuffle_x45!($x01, 2),
_ => shuffle_x45!($x01, 3),
}
}
}
match imm8 & 0b11 {
0b00 => shuffle_x23!(0),
0b01 => shuffle_x23!(1),
0b10 => shuffle_x23!(2),
_ => shuffle_x23!(3),
}
}
/// Shuffle eight 32-bit foating-point elements in `a` across lanes using
/// the corresponding 32-bit integer index in `idx`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 {
permps(a, idx)
}
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to
@@ -1760,8 +1860,115 @@ macro_rules! shuffle_x23 {
}
}
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
/// to the output.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
#[cfg_attr(rustfmt, rustfmt_skip)]
simd_shuffle16(a, a, [
0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
]);
}
}
macro_rules! shuffle_x67 {
($x01:expr, $x23:expr, $x45:expr) => {
match (imm8 >> 6) & 0b11 {
0b00 => shuffle_done!($x01, $x23, $x45, 0),
0b01 => shuffle_done!($x01, $x23, $x45, 1),
0b10 => shuffle_done!($x01, $x23, $x45, 2),
_ => shuffle_done!($x01, $x23, $x45, 3),
}
}
}
macro_rules! shuffle_x45 {
($x01:expr, $x23:expr) => {
match (imm8 >> 4) & 0b11 {
0b00 => shuffle_x67!($x01, $x23, 0),
0b01 => shuffle_x67!($x01, $x23, 1),
0b10 => shuffle_x67!($x01, $x23, 2),
_ => shuffle_x67!($x01, $x23, 3),
}
}
}
macro_rules! shuffle_x23 {
($x01:expr) => {
match (imm8 >> 2) & 0b11 {
0b00 => shuffle_x45!($x01, 0),
0b01 => shuffle_x45!($x01, 1),
0b10 => shuffle_x45!($x01, 2),
_ => shuffle_x45!($x01, 3),
}
}
}
match imm8 & 0b11 {
0b00 => shuffle_x23!(0),
0b01 => shuffle_x23!(1),
0b10 => shuffle_x23!(2),
_ => shuffle_x23!(3),
}
}
/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
/// to the output.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
#[cfg_attr(rustfmt, rustfmt_skip)]
simd_shuffle16(a, a, [
0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
]);
}
}
macro_rules! shuffle_x67 {
($x01:expr, $x23:expr, $x45:expr) => {
match (imm8 >> 6) & 0b11 {
0b00 => shuffle_done!($x01, $x23, $x45, 0),
0b01 => shuffle_done!($x01, $x23, $x45, 1),
0b10 => shuffle_done!($x01, $x23, $x45, 2),
_ => shuffle_done!($x01, $x23, $x45, 3),
}
}
}
macro_rules! shuffle_x45 {
($x01:expr, $x23:expr) => {
match (imm8 >> 4) & 0b11 {
0b00 => shuffle_x67!($x01, $x23, 0),
0b01 => shuffle_x67!($x01, $x23, 1),
0b10 => shuffle_x67!($x01, $x23, 2),
_ => shuffle_x67!($x01, $x23, 3),
}
}
}
macro_rules! shuffle_x23 {
($x01:expr) => {
match (imm8 >> 2) & 0b11 {
0b00 => shuffle_x45!($x01, 0),
0b01 => shuffle_x45!($x01, 1),
0b10 => shuffle_x45!($x01, 2),
_ => shuffle_x45!($x01, 3),
}
}
}
match imm8 & 0b11 {
0b00 => shuffle_x23!(0),
0b01 => shuffle_x23!(1),
0b10 => shuffle_x23!(2),
_ => shuffle_x23!(3),
}
}
/// Negate packed 16-bit integers in `a` when the corresponding signed
/// 16-bit integer in `b` is negative, and return the results.
@@ -2626,6 +2833,10 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
fn pshufb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.permd"]
fn permd(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.permps"]
fn permps(a: f32x8, b: i32x8) -> f32x8;
#[link_name = "llvm.x86.avx2.vperm2i128"]
fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
#[link_name = "llvm.x86.avx2.gather.d.d"]
fn pgatherdd(
src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
@@ -2700,7 +2911,7 @@ mod tests {
use v256::*;
use v128::*;
use x86::i586::avx2;
use x86::__m256i;
use x86::{__m128i, __m256i};
use std;
#[simd_test = "avx2"]
@@ -3306,6 +3517,14 @@ unsafe fn _mm256_cvtepu8_epi64() {
assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
}
#[simd_test = "avx2"]
unsafe fn _mm256_extracti128_si256() {
let a = __m256i::from(i64x4::new(1, 2, 3, 4));
let r = avx2::_mm256_extracti128_si256(a, 0b01);
let e = __m128i::from(i64x2::new(3, 4));
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_hadd_epi16() {
let a = i16x16::splat(2);
@@ -3370,6 +3589,15 @@ unsafe fn _mm256_madd_epi16() {
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_inserti128_si256() {
let a = __m256i::from(i64x4::new(1, 2, 3, 4));
let b = __m128i::from(i64x2::new(7, 8));
let r = avx2::_mm256_inserti128_si256(a, b, 0b01);
let e = i64x4::new(1, 2, 7, 8);
assert_eq!(r, __m256i::from(e));
}
#[simd_test = "avx2"]
unsafe fn _mm256_maddubs_epi16() {
let a = u8x32::splat(2);
@@ -3704,6 +3932,38 @@ unsafe fn _mm256_sad_epu8() {
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_shufflehi_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i16x16::new(
0, 1, 2, 3, 11, 22, 33, 44,
4, 5, 6, 7, 55, 66, 77, 88,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x16::new(
0, 1, 2, 3, 44, 22, 22, 11,
4, 5, 6, 7, 88, 66, 66, 55,
);
let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_shufflelo_epi16() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i16x16::new(
11, 22, 33, 44, 0, 1, 2, 3,
55, 66, 77, 88, 4, 5, 6, 7,
);
#[cfg_attr(rustfmt, rustfmt_skip)]
let e = i16x16::new(
44, 22, 22, 11, 0, 1, 2, 3,
88, 66, 66, 55, 4, 5, 6, 7,
);
let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_sign_epi16() {
let a = i16x16::splat(2);
@@ -4119,6 +4379,32 @@ unsafe fn _mm256_permute4x64_epi64() {
assert_eq!(r, expected);
}
#[simd_test = "avx2"]
unsafe fn _mm256_permute2x128_si256() {
let a = __m256i::from(i64x4::new(100, 200, 500, 600));
let b = __m256i::from(i64x4::new(300, 400, 700, 800));
let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11);
let e = i64x4::new(700, 800, 500, 600);
assert_eq!(i64x4::from(r), e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_permute4x64_pd() {
let a = f64x4::new(1., 2., 3., 4.);
let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11);
let e = f64x4::new(4., 1., 2., 1.);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm256_permutevar8x32_ps() {
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4);
let r = avx2::_mm256_permutevar8x32_ps(a, b);
let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
unsafe fn _mm_i32gather_epi32() {
let mut arr = [0i32; 128];