mirror of
https://github.com/rust-lang/rust.git
synced 2026-05-21 17:52:12 +03:00
[avx2] add shuffle, insert/extract i128, permute* (#210)
* [x86][avx2] add _mm256_shuffle{hi,lo}_epi16
* [x86][avx2] add _mm256_{insert,extract}i128_si256
* [x86][avx2] add remaining permute intrinsics
This commit is contained in:
@@ -24,7 +24,7 @@
|
||||
|
||||
use v256::*;
|
||||
use v128::*;
|
||||
use x86::__m256i;
|
||||
use x86::{__m128i, __m256i};
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
@@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
|
||||
simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
|
||||
}
|
||||
|
||||
// TODO _m128i _mm256_extracti128_si256
|
||||
/// Extract 128 bits (of integer data) from `a` selected with `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
|
||||
pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
|
||||
use x86::i586::avx::_mm256_undefined_si256;
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
let b = i64x4::from(_mm256_undefined_si256());
|
||||
let dst: i64x2 = match imm8 & 0b01 {
|
||||
0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
|
||||
_ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
|
||||
};
|
||||
__m128i::from(dst)
|
||||
}
|
||||
|
||||
/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
|
||||
#[inline(always)]
|
||||
@@ -1191,7 +1204,23 @@ macro_rules! call {
|
||||
constify_imm8!(scale, call)
|
||||
}
|
||||
|
||||
// TODO _mm256_inserti128_si256
|
||||
/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
|
||||
/// location specified by `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
|
||||
pub unsafe fn _mm256_inserti128_si256(
|
||||
a: __m256i, b: __m128i, imm8: i32
|
||||
) -> __m256i {
|
||||
use x86::i586::avx::_mm256_castsi128_si256;
|
||||
let imm8 = (imm8 & 0b01) as u8;
|
||||
let b = i64x4::from(_mm256_castsi128_si256(b));
|
||||
let dst: i64x4 = match imm8 & 0b01 {
|
||||
0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
|
||||
_ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]),
|
||||
};
|
||||
__m256i::from(dst)
|
||||
}
|
||||
|
||||
/// Multiply packed signed 16-bit integers in `a` and `b`, producing
|
||||
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
|
||||
@@ -1616,9 +1645,80 @@ macro_rules! permute1 {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
|
||||
// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
|
||||
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
|
||||
/// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
|
||||
pub unsafe fn _mm256_permute2x128_si256(
|
||||
a: __m256i, b: __m256i, imm8: i32
|
||||
) -> __m256i {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
__m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8))
|
||||
}
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
/// Shuffle 64-bit floating-point elements in `a` across lanes using the
|
||||
/// control in `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
|
||||
pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 {
|
||||
use x86::i586::avx::_mm256_undefined_pd;
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67])
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x67 {
|
||||
($x01:expr, $x23:expr, $x45:expr) => {
|
||||
match (imm8 >> 6) & 0b11 {
|
||||
0b00 => shuffle_done!($x01, $x23, $x45, 0),
|
||||
0b01 => shuffle_done!($x01, $x23, $x45, 1),
|
||||
0b10 => shuffle_done!($x01, $x23, $x45, 2),
|
||||
_ => shuffle_done!($x01, $x23, $x45, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x45 {
|
||||
($x01:expr, $x23:expr) => {
|
||||
match (imm8 >> 4) & 0b11 {
|
||||
0b00 => shuffle_x67!($x01, $x23, 0),
|
||||
0b01 => shuffle_x67!($x01, $x23, 1),
|
||||
0b10 => shuffle_x67!($x01, $x23, 2),
|
||||
_ => shuffle_x67!($x01, $x23, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x23 {
|
||||
($x01:expr) => {
|
||||
match (imm8 >> 2) & 0b11 {
|
||||
0b00 => shuffle_x45!($x01, 0),
|
||||
0b01 => shuffle_x45!($x01, 1),
|
||||
0b10 => shuffle_x45!($x01, 2),
|
||||
_ => shuffle_x45!($x01, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
match imm8 & 0b11 {
|
||||
0b00 => shuffle_x23!(0),
|
||||
0b01 => shuffle_x23!(1),
|
||||
0b10 => shuffle_x23!(2),
|
||||
_ => shuffle_x23!(3),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shuffle eight 32-bit foating-point elements in `a` across lanes using
|
||||
/// the corresponding 32-bit integer index in `idx`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpermps))]
|
||||
pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 {
|
||||
permps(a, idx)
|
||||
}
|
||||
|
||||
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
|
||||
/// and `b`, then horizontally sum each consecutive 8 differences to
|
||||
@@ -1760,8 +1860,115 @@ macro_rules! shuffle_x23 {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
|
||||
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
|
||||
/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
|
||||
/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
|
||||
/// to the output.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
|
||||
pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 {
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
simd_shuffle16(a, a, [
|
||||
0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
|
||||
8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
|
||||
]);
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x67 {
|
||||
($x01:expr, $x23:expr, $x45:expr) => {
|
||||
match (imm8 >> 6) & 0b11 {
|
||||
0b00 => shuffle_done!($x01, $x23, $x45, 0),
|
||||
0b01 => shuffle_done!($x01, $x23, $x45, 1),
|
||||
0b10 => shuffle_done!($x01, $x23, $x45, 2),
|
||||
_ => shuffle_done!($x01, $x23, $x45, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x45 {
|
||||
($x01:expr, $x23:expr) => {
|
||||
match (imm8 >> 4) & 0b11 {
|
||||
0b00 => shuffle_x67!($x01, $x23, 0),
|
||||
0b01 => shuffle_x67!($x01, $x23, 1),
|
||||
0b10 => shuffle_x67!($x01, $x23, 2),
|
||||
_ => shuffle_x67!($x01, $x23, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x23 {
|
||||
($x01:expr) => {
|
||||
match (imm8 >> 2) & 0b11 {
|
||||
0b00 => shuffle_x45!($x01, 0),
|
||||
0b01 => shuffle_x45!($x01, 1),
|
||||
0b10 => shuffle_x45!($x01, 2),
|
||||
_ => shuffle_x45!($x01, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
match imm8 & 0b11 {
|
||||
0b00 => shuffle_x23!(0),
|
||||
0b01 => shuffle_x23!(1),
|
||||
0b10 => shuffle_x23!(2),
|
||||
_ => shuffle_x23!(3),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
|
||||
/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
|
||||
/// to the output.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
|
||||
pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 {
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
simd_shuffle16(a, a, [
|
||||
0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
|
||||
8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
|
||||
]);
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x67 {
|
||||
($x01:expr, $x23:expr, $x45:expr) => {
|
||||
match (imm8 >> 6) & 0b11 {
|
||||
0b00 => shuffle_done!($x01, $x23, $x45, 0),
|
||||
0b01 => shuffle_done!($x01, $x23, $x45, 1),
|
||||
0b10 => shuffle_done!($x01, $x23, $x45, 2),
|
||||
_ => shuffle_done!($x01, $x23, $x45, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x45 {
|
||||
($x01:expr, $x23:expr) => {
|
||||
match (imm8 >> 4) & 0b11 {
|
||||
0b00 => shuffle_x67!($x01, $x23, 0),
|
||||
0b01 => shuffle_x67!($x01, $x23, 1),
|
||||
0b10 => shuffle_x67!($x01, $x23, 2),
|
||||
_ => shuffle_x67!($x01, $x23, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x23 {
|
||||
($x01:expr) => {
|
||||
match (imm8 >> 2) & 0b11 {
|
||||
0b00 => shuffle_x45!($x01, 0),
|
||||
0b01 => shuffle_x45!($x01, 1),
|
||||
0b10 => shuffle_x45!($x01, 2),
|
||||
_ => shuffle_x45!($x01, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
match imm8 & 0b11 {
|
||||
0b00 => shuffle_x23!(0),
|
||||
0b01 => shuffle_x23!(1),
|
||||
0b10 => shuffle_x23!(2),
|
||||
_ => shuffle_x23!(3),
|
||||
}
|
||||
}
|
||||
|
||||
/// Negate packed 16-bit integers in `a` when the corresponding signed
|
||||
/// 16-bit integer in `b` is negative, and return the results.
|
||||
@@ -2626,6 +2833,10 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
|
||||
fn pshufb(a: u8x32, b: u8x32) -> u8x32;
|
||||
#[link_name = "llvm.x86.avx2.permd"]
|
||||
fn permd(a: u32x8, b: u32x8) -> u32x8;
|
||||
#[link_name = "llvm.x86.avx2.permps"]
|
||||
fn permps(a: f32x8, b: i32x8) -> f32x8;
|
||||
#[link_name = "llvm.x86.avx2.vperm2i128"]
|
||||
fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
|
||||
#[link_name = "llvm.x86.avx2.gather.d.d"]
|
||||
fn pgatherdd(
|
||||
src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
|
||||
@@ -2700,7 +2911,7 @@ mod tests {
|
||||
use v256::*;
|
||||
use v128::*;
|
||||
use x86::i586::avx2;
|
||||
use x86::__m256i;
|
||||
use x86::{__m128i, __m256i};
|
||||
use std;
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
@@ -3306,6 +3517,14 @@ unsafe fn _mm256_cvtepu8_epi64() {
|
||||
assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_extracti128_si256() {
|
||||
let a = __m256i::from(i64x4::new(1, 2, 3, 4));
|
||||
let r = avx2::_mm256_extracti128_si256(a, 0b01);
|
||||
let e = __m128i::from(i64x2::new(3, 4));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_hadd_epi16() {
|
||||
let a = i16x16::splat(2);
|
||||
@@ -3370,6 +3589,15 @@ unsafe fn _mm256_madd_epi16() {
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_inserti128_si256() {
|
||||
let a = __m256i::from(i64x4::new(1, 2, 3, 4));
|
||||
let b = __m128i::from(i64x2::new(7, 8));
|
||||
let r = avx2::_mm256_inserti128_si256(a, b, 0b01);
|
||||
let e = i64x4::new(1, 2, 7, 8);
|
||||
assert_eq!(r, __m256i::from(e));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_maddubs_epi16() {
|
||||
let a = u8x32::splat(2);
|
||||
@@ -3704,6 +3932,38 @@ unsafe fn _mm256_sad_epu8() {
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_shufflehi_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 11, 22, 33, 44,
|
||||
4, 5, 6, 7, 55, 66, 77, 88,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x16::new(
|
||||
0, 1, 2, 3, 44, 22, 22, 11,
|
||||
4, 5, 6, 7, 88, 66, 66, 55,
|
||||
);
|
||||
let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_shufflelo_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i16x16::new(
|
||||
11, 22, 33, 44, 0, 1, 2, 3,
|
||||
55, 66, 77, 88, 4, 5, 6, 7,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x16::new(
|
||||
44, 22, 22, 11, 0, 1, 2, 3,
|
||||
88, 66, 66, 55, 4, 5, 6, 7,
|
||||
);
|
||||
let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_sign_epi16() {
|
||||
let a = i16x16::splat(2);
|
||||
@@ -4119,6 +4379,32 @@ unsafe fn _mm256_permute4x64_epi64() {
|
||||
assert_eq!(r, expected);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_permute2x128_si256() {
|
||||
let a = __m256i::from(i64x4::new(100, 200, 500, 600));
|
||||
let b = __m256i::from(i64x4::new(300, 400, 700, 800));
|
||||
let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11);
|
||||
let e = i64x4::new(700, 800, 500, 600);
|
||||
assert_eq!(i64x4::from(r), e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_permute4x64_pd() {
|
||||
let a = f64x4::new(1., 2., 3., 4.);
|
||||
let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11);
|
||||
let e = f64x4::new(4., 1., 2., 1.);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_permutevar8x32_ps() {
|
||||
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4);
|
||||
let r = avx2::_mm256_permutevar8x32_ps(a, b);
|
||||
let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm_i32gather_epi32() {
|
||||
let mut arr = [0i32; 128];
|
||||
|
||||
Reference in New Issue
Block a user