x86: add unsafe to all x86 vendor intrinsics

Also, add missing assert_instr tests to each intrinsic, where possible.
This commit is contained in:
Andrew Gallant
2017-09-26 21:53:50 -04:00
parent ff9e960628
commit 6dfc65289c
12 changed files with 1611 additions and 1213 deletions
+5 -3
View File
@@ -24,9 +24,11 @@ fn index(needle: &str, haystack: &str) -> usize {
haystack.resize(16, 0);
let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
unsafe {
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
}
}
pub fn main() {
+8 -8
View File
@@ -19,7 +19,7 @@
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
/// Counts the leading most significant zero bits.
///
@@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
#[cfg(test)]
mod tests {
@@ -49,21 +49,21 @@ mod tests {
#[simd_test = "lzcnt"]
fn _lzcnt_u32() {
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32);
}
#[simd_test = "lzcnt"]
fn _lzcnt_u64() {
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64);
}
#[simd_test = "popcnt"]
fn _popcnt32() {
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4);
}
#[simd_test = "popcnt"]
fn _popcnt64() {
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4);
}
}
+35 -36
View File
@@ -1,14 +1,14 @@
use v256::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v256::*;
/// Add packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddpd))]
pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
a + b
}
@@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddps))]
pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
a + b
}
@@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmulpd))]
pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
a * b
}
@@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmulps))]
pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
a * b
}
@@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddsubpd))]
pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
unsafe { addsubpd256(a, b) }
pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
addsubpd256(a, b)
}
/// Alternatively add and subtract packed single-precision (32-bit)
@@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddsubps))]
pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
unsafe { addsubps256(a, b) }
pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
addsubps256(a, b)
}
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
@@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vsubpd))]
pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
a - b
}
@@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vsubps))]
pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
a - b
}
/// Round packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `b`. The value of `b` may be as follows:
///
/// ```ignore
/// 0x00: Round to the nearest whole number.
/// 0x01: Round down, toward negative infinity.
/// 0x02: Round up, toward positive infinity.
/// 0x03: Truncate the values.
/// For a few additional value options, check the LLVM docs:
/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
/// ```
#[inline(always)]
#[target_feature = "+avx"]
pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
macro_rules! call {
($imm8:expr) => {
unsafe { roundpd256(a, $imm8) }
}
($imm8:expr) => { roundpd256(a, $imm8) }
}
constify_imm8!(b, call)
}
@@ -96,7 +95,7 @@ macro_rules! call {
#[cfg_attr(test, assert_instr(vroundpd))]
#[target_feature = "+avx"]
fn test_mm256_round_pd(a: f64x4) -> f64x4 {
_mm256_round_pd(a, 0x3)
unsafe { _mm256_round_pd(a, 0x3) }
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
@@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
unsafe { roundpd256(a, 0x02) }
pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
roundpd256(a, 0x02)
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
@@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
unsafe { roundpd256(a, 0x01) }
pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
roundpd256(a, 0x01)
}
/// LLVM intrinsics used in the above functions
@@ -139,7 +138,7 @@ mod tests {
fn _mm256_add_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_add_pd(a, b);
let r = unsafe { avx::_mm256_add_pd(a, b) };
let e = f64x4::new(6.0, 8.0, 10.0, 12.0);
assert_eq!(r, e);
}
@@ -148,7 +147,7 @@ fn _mm256_add_pd() {
fn _mm256_add_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
let r = avx::_mm256_add_ps(a, b);
let r = unsafe { avx::_mm256_add_ps(a, b) };
let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0);
assert_eq!(r, e);
}
@@ -157,7 +156,7 @@ fn _mm256_add_ps() {
fn _mm256_mul_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_mul_pd(a, b);
let r = unsafe { avx::_mm256_mul_pd(a, b) };
let e = f64x4::new(5.0, 12.0, 21.0, 32.0);
assert_eq!(r, e);
}
@@ -166,7 +165,7 @@ fn _mm256_mul_pd() {
fn _mm256_mul_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
let r = avx::_mm256_mul_ps(a, b);
let r = unsafe { avx::_mm256_mul_ps(a, b) };
let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0);
assert_eq!(r, e);
}
@@ -175,7 +174,7 @@ fn _mm256_mul_ps() {
fn _mm256_addsub_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_addsub_pd(a, b);
let r = unsafe { avx::_mm256_addsub_pd(a, b) };
let e = f64x4::new(-4.0, 8.0, -4.0, 12.0);
assert_eq!(r, e);
}
@@ -184,7 +183,7 @@ fn _mm256_addsub_pd() {
fn _mm256_addsub_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_addsub_ps(a, b);
let r = unsafe { avx::_mm256_addsub_ps(a, b) };
let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0);
assert_eq!(r, e);
}
@@ -193,7 +192,7 @@ fn _mm256_addsub_ps() {
fn _mm256_sub_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_sub_pd(a, b);
let r = unsafe { avx::_mm256_sub_pd(a, b) };
let e = f64x4::new(-4.0,-4.0,-4.0,-4.0);
assert_eq!(r, e);
}
@@ -202,7 +201,7 @@ fn _mm256_sub_pd() {
fn _mm256_sub_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0);
let r = avx::_mm256_sub_ps(a, b);
let r = unsafe { avx::_mm256_sub_ps(a, b) };
let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0);
assert_eq!(r, e);
}
@@ -210,9 +209,9 @@ fn _mm256_sub_ps() {
#[simd_test = "avx"]
fn _mm256_round_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_closest = avx::_mm256_round_pd(a, 0b00000000);
let result_down = avx::_mm256_round_pd(a, 0b00000001);
let result_up = avx::_mm256_round_pd(a, 0b00000010);
let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) };
let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) };
let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) };
let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0);
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
@@ -224,7 +223,7 @@ fn _mm256_round_pd() {
#[simd_test = "avx"]
fn _mm256_floor_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_down = avx::_mm256_floor_pd(a);
let result_down = unsafe { avx::_mm256_floor_pd(a) };
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
assert_eq!(result_down, expected_down);
}
@@ -232,7 +231,7 @@ fn _mm256_floor_pd() {
#[simd_test = "avx"]
fn _mm256_ceil_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_up = avx::_mm256_ceil_pd(a, );
let result_up = unsafe { avx::_mm256_ceil_pd(a) };
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
assert_eq!(result_up, expected_up);
}
+316 -316
View File
@@ -9,31 +9,31 @@
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpabsd))]
pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
unsafe { pabsd(a) }
pub unsafe fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
pabsd(a)
}
/// Computes the absolute values of packed 16-bit integers in `a`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpabsw))]
pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
unsafe { pabsw(a) }
pub unsafe fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
pabsw(a)
}
/// Computes the absolute values of packed 8-bit integers in `a`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpabsb))]
pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
unsafe { pabsb(a) }
pub unsafe fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
pabsb(a)
}
/// Add packed 64-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddq))]
pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
pub unsafe fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
a + b
}
@@ -41,7 +41,7 @@ pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddd))]
pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
pub unsafe fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
a + b
}
@@ -49,7 +49,7 @@ pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddw))]
pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
pub unsafe fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
a + b
}
@@ -57,7 +57,7 @@ pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddb))]
pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
pub unsafe fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
a + b
}
@@ -65,32 +65,32 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddsb))]
pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { paddsb(a, b) }
pub unsafe fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
paddsb(a, b)
}
/// Add packed 16-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddsw))]
pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { paddsw(a, b) }
pub unsafe fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
paddsw(a, b)
}
/// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddusb))]
pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { paddusb(a, b) }
pub unsafe fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
paddusb(a, b)
}
/// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpaddusw))]
pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { paddusw(a, b) }
pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
paddusw(a, b)
}
// TODO _mm256_alignr_epi8
@@ -100,7 +100,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vandps))]
pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
a & b
}
@@ -109,7 +109,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vandnps))]
pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
(!a) & b
}
@@ -117,16 +117,16 @@ pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpavgw))]
pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
unsafe { pavgw(a, b) }
pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
pavgw(a, b)
}
/// Average packed unsigned 8-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpavgb))]
pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
unsafe { pavgb(a, b) }
pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
pavgb(a, b)
}
// TODO _mm256_blend_epi16
@@ -137,8 +137,8 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpblendvb))]
pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
unsafe { pblendvb(a,b,mask) }
pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
pblendvb(a,b,mask)
}
// TODO _mm_broadcastb_epi8
@@ -158,12 +158,11 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
// TODO _mm256_bslli_epi128
// TODO _mm256_bsrli_epi128
/// Compare packed 64-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
pub unsafe fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
a.eq(b)
}
@@ -171,7 +170,7 @@ pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
pub unsafe fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
a.eq(b)
}
@@ -179,7 +178,7 @@ pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
pub unsafe fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
a.eq(b)
}
@@ -187,7 +186,7 @@ pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
pub unsafe fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
a.eq(b)
}
@@ -195,7 +194,7 @@ pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
pub unsafe fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
a.gt(b)
}
@@ -203,7 +202,7 @@ pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
pub unsafe fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
a.gt(b)
}
@@ -211,7 +210,7 @@ pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
pub unsafe fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
a.gt(b)
}
@@ -219,7 +218,7 @@ pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
pub unsafe fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
a.gt(b)
}
@@ -241,16 +240,16 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphaddw))]
pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { phaddw(a, b) }
pub unsafe fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
phaddw(a, b)
}
/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphaddd))]
pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { phaddd(a, b) }
pub unsafe fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
phaddd(a, b)
}
/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`
@@ -258,24 +257,24 @@ pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphaddsw))]
pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { phaddsw(a, b) }
pub unsafe fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
phaddsw(a, b)
}
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphsubw))]
pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { phsubw(a, b) }
pub unsafe fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
phsubw(a, b)
}
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphsubd))]
pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { phsubd(a, b) }
pub unsafe fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
phsubd(a, b)
}
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -283,8 +282,8 @@ pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vphsubsw))]
pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { phsubsw(a, b) }
pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
phsubsw(a, b)
}
@@ -328,8 +327,8 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
unsafe { pmaddwd(a, b) }
pub unsafe fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
pmaddwd(a, b)
}
/// Vertically multiply each unsigned 8-bit integer from `a` with the
@@ -339,8 +338,8 @@ pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaddubsw))]
pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
unsafe { pmaddubsw(a, b) }
pub unsafe fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
pmaddubsw(a, b)
}
// TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
@@ -357,8 +356,8 @@ pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxsw))]
pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pmaxsw(a, b) }
pub unsafe fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
pmaxsw(a, b)
}
/// Compare packed 32-bit integers in `a` and `b`, and return the packed
@@ -366,8 +365,8 @@ pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { pmaxsd(a, b) }
pub unsafe fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
pmaxsd(a, b)
}
/// Compare packed 8-bit integers in `a` and `b`, and return the packed
@@ -375,8 +374,8 @@ pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxsb))]
pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { pmaxsb(a, b) }
pub unsafe fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
pmaxsb(a, b)
}
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return
@@ -384,8 +383,8 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxuw))]
pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pmaxuw(a, b) }
pub unsafe fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
pmaxuw(a, b)
}
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
@@ -393,8 +392,8 @@ pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
unsafe { pmaxud(a, b) }
pub unsafe fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
pmaxud(a, b)
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
@@ -402,8 +401,8 @@ pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmaxub))]
pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { pmaxub(a, b) }
pub unsafe fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
pmaxub(a, b)
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
@@ -411,8 +410,8 @@ pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminsw))]
pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pminsw(a, b) }
pub unsafe fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
pminsw(a, b)
}
/// Compare packed 32-bit integers in `a` and `b`, and return the packed
@@ -420,8 +419,8 @@ pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminsd))]
pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { pminsd(a, b) }
pub unsafe fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
pminsd(a, b)
}
/// Compare packed 8-bit integers in `a` and `b`, and return the packed
@@ -429,8 +428,8 @@ pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminsb))]
pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { pminsb(a, b) }
pub unsafe fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
pminsb(a, b)
}
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return
@@ -438,8 +437,8 @@ pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminuw))]
pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pminuw(a, b) }
pub unsafe fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
pminuw(a, b)
}
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
@@ -447,8 +446,8 @@ pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminud))]
pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
unsafe { pminud(a, b) }
pub unsafe fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
pminud(a, b)
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
@@ -456,8 +455,8 @@ pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpminub))]
pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { pminub(a, b) }
pub unsafe fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
pminub(a, b)
}
/*** The following two functions fail in debug, but work in release
@@ -492,8 +491,8 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
unsafe { pmuldq(a, b) }
pub unsafe fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
pmuldq(a, b)
}
/// Multiply the low unsigned 32-bit integers from each packed 64-bit
@@ -503,8 +502,8 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
unsafe { pmuludq(a, b) }
pub unsafe fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
pmuludq(a, b)
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
@@ -513,8 +512,8 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulhw))]
pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pmulhw(a, b) }
pub unsafe fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
pmulhw(a, b)
}
/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -523,8 +522,8 @@ pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulhuw))]
pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pmulhuw(a, b) }
pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
pmulhuw(a, b)
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
@@ -533,7 +532,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmullw))]
pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
a * b
}
@@ -544,7 +543,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulld))]
pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
a * b
}
@@ -555,8 +554,8 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
unsafe { pmulhrsw(a, b) }
pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
pmulhrsw(a, b)
}
/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
@@ -564,7 +563,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vorps))]
pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
a | b
}
@@ -573,8 +572,8 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpacksswb))]
pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
unsafe { packsswb(a, b) }
pub unsafe fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
packsswb(a, b)
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -582,8 +581,8 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpackssdw))]
pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
unsafe { packssdw(a, b) }
pub unsafe fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
packssdw(a, b)
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
@@ -591,8 +590,8 @@ pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpackuswb))]
pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
unsafe { packuswb(a, b) }
pub unsafe fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
packuswb(a, b)
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -600,8 +599,8 @@ pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpackusdw))]
pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
unsafe { packusdw(a, b) }
pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
packusdw(a, b)
}
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
@@ -617,8 +616,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsadbw))]
pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
unsafe { psadbw(a, b) }
pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
psadbw(a, b)
}
// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
@@ -632,8 +631,8 @@ pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsignw))]
pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { psignw(a, b) }
pub unsafe fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
psignw(a, b)
}
/// Negate packed 32-bit integers in `a` when the corresponding signed
@@ -642,8 +641,8 @@ pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsignd))]
pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { psignd(a, b) }
pub unsafe fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
psignd(a, b)
}
/// Negate packed 8-bit integers in `a` when the corresponding signed
@@ -652,8 +651,8 @@ pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsignb))]
pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { psignb(a, b) }
pub unsafe fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
psignb(a, b)
}
/// Shift packed 16-bit integers in `a` left by `count` while
@@ -661,8 +660,8 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllw))]
pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
unsafe { psllw(a, count) }
pub unsafe fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
psllw(a, count)
}
/// Shift packed 32-bit integers in `a` left by `count` while
@@ -670,8 +669,8 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpslld))]
pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { pslld(a, count) }
pub unsafe fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
pslld(a, count)
}
/// Shift packed 64-bit integers in `a` left by `count` while
@@ -679,35 +678,35 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllq))]
pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
unsafe { psllq(a, count) }
pub unsafe fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
psllq(a, count)
}
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `pslliw` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsllw))] // TODO: should this be pslli
pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { pslliw(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `pslliw` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsllw))]
pub unsafe fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
pslliw(a, imm8)
}
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psllid` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpslld))] // TODO: should this be pslli
pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
unsafe { psllid(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psllid` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
psllid(a, imm8)
}
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `pslliq` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsllq))] // TODO: should this be pslli
pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
unsafe { pslliq(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `pslliq` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
pslliq(a, imm8)
}
// TODO _mm256_slli_si256 (__m256i a, const int imm8)
@@ -718,8 +717,8 @@ pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { psllvd(a, count) }
pub unsafe fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
psllvd(a, count)
}
/// Shift packed 32-bit integers in `a` left by the amount
@@ -728,8 +727,8 @@ pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
unsafe { psllvd256(a, count) }
pub unsafe fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
psllvd256(a, count)
}
/// Shift packed 64-bit integers in `a` left by the amount
@@ -738,8 +737,8 @@ pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
unsafe { psllvq(a, count) }
pub unsafe fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
psllvq(a, count)
}
/// Shift packed 64-bit integers in `a` left by the amount
@@ -748,8 +747,8 @@ pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
unsafe { psllvq256(a, count) }
pub unsafe fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
psllvq256(a, count)
}
/// Shift packed 16-bit integers in `a` right by `count` while
@@ -757,8 +756,8 @@ pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsraw))]
pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
unsafe { psraw(a, count) }
pub unsafe fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
psraw(a, count)
}
/// Shift packed 32-bit integers in `a` right by `count` while
@@ -766,26 +765,26 @@ pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrad))]
pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { psrad(a, count) }
pub unsafe fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
psrad(a, count)
}
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
/// sign bits, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psraiw` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsraw))] // TODO: notvpsraiw?
pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { psraiw(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psraiw` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsraw))]
pub unsafe fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
psraiw(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
/// sign bits, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psraid` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsrad))] // TODO: not vpsraid?
pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
unsafe { psraid(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psraid` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
psraid(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by the amount specified by the
@@ -793,8 +792,8 @@ pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsravd))]
pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { psravd(a, count) }
pub unsafe fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
psravd(a, count)
}
/// Shift packed 32-bit integers in `a` right by the amount specified by the
@@ -802,8 +801,8 @@ pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsravd))]
pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
unsafe { psravd256(a, count) }
pub unsafe fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
psravd256(a, count)
}
@@ -812,8 +811,8 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlw))]
pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
unsafe { psrlw(a, count) }
pub unsafe fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
psrlw(a, count)
}
/// Shift packed 32-bit integers in `a` right by `count` while shifting in
@@ -821,8 +820,8 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrld))]
pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { psrld(a, count) }
pub unsafe fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
psrld(a, count)
}
/// Shift packed 64-bit integers in `a` right by `count` while shifting in
@@ -830,35 +829,35 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
unsafe { psrlq(a, count) }
pub unsafe fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
psrlq(a, count)
}
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psrliw` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsrlw))] // TODO not vpsrliw?
pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { psrliw(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psrliw` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsrlw))]
pub unsafe fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
psrliw(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psrlid` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsrld))] // TODO: not vpsrlid?
pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
unsafe { psrlid(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psrlid` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
psrlid(a, imm8)
}
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
/// zeros, and return the results.
#[inline(always)]
#[target_feature = "+avx2"]
// Pre-commit form: a safe fn that wraps the `psrliq` call in an `unsafe` block.
#[cfg_attr(test, assert_instr(vpsrlq))] // TODO: not vpsrliq?
pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
unsafe { psrliq(a, imm8) }
// Post-commit form: the fn itself is `unsafe` and calls `psrliq` directly;
// the trailing TODO on the `assert_instr` line is gone.
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
psrliq(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by the amount specified by
@@ -866,8 +865,8 @@ pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { psrlvd(a, count) }
pub unsafe fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
psrlvd(a, count)
}
/// Shift packed 32-bit integers in `a` right by the amount specified by
@@ -875,8 +874,8 @@ pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
unsafe { psrlvd256(a, count) }
pub unsafe fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
psrlvd256(a, count)
}
/// Shift packed 64-bit integers in `a` right by the amount specified by
@@ -884,8 +883,8 @@ pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
unsafe { psrlvq(a, count) }
pub unsafe fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
psrlvq(a, count)
}
/// Shift packed 64-bit integers in `a` right by the amount specified by
@@ -893,8 +892,8 @@ pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
unsafe { psrlvq256(a, count) }
pub unsafe fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
psrlvq256(a, count)
}
// TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
@@ -903,7 +902,7 @@ pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubw))]
pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
pub unsafe fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
a - b
}
@@ -911,7 +910,7 @@ pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubd))]
pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
pub unsafe fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
a - b
}
@@ -919,7 +918,7 @@ pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubq))]
pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
pub unsafe fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
a - b
}
@@ -927,7 +926,7 @@ pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubb))]
pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
pub unsafe fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
a - b
}
@@ -936,8 +935,8 @@ pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubsw))]
pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { psubsw(a, b) }
pub unsafe fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
psubsw(a, b)
}
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
@@ -945,8 +944,8 @@ pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubsb))]
pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { psubsb(a, b) }
pub unsafe fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
psubsb(a, b)
}
/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
@@ -954,8 +953,8 @@ pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubusw))]
pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { psubusw(a, b) }
pub unsafe fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
psubusw(a, b)
}
/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
@@ -963,8 +962,8 @@ pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vpsubusb))]
pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { psubusb(a, b) }
pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
psubusb(a, b)
}
// TODO __mm256_unpackhi_epi16 (__m256i a, __m256i b)
@@ -981,11 +980,10 @@ pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
#[inline(always)]
#[target_feature = "+avx2"]
#[cfg_attr(test, assert_instr(vxorps))]
pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
a ^ b
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx2.pabs.b"]
@@ -1048,9 +1046,9 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
fn pminud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pminu.b"]
fn pminub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug
#[link_name = "llvm.x86.avx2.pmovmskb"]
fn pmovmskb(a: i8x32) -> i32;
#[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug
#[link_name = "llvm.x86.avx2.mpsadbw"]
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
#[link_name = "llvm.x86.avx2.pmulhu.w"]
fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
@@ -1141,7 +1139,6 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
@@ -1157,7 +1154,7 @@ fn _mm256_abs_epi32() {
let a = i32x8::new(
0, 1, -1, std::i32::MAX,
std::i32::MIN + 1, 100, -100, -32);
let r = avx2::_mm256_abs_epi32(a);
let r = unsafe { avx2::_mm256_abs_epi32(a) };
let e = i32x8::new(
0, 1, 1, std::i32::MAX,
(std::i32::MIN + 1).abs(), 100, 100, 32);
@@ -1171,7 +1168,7 @@ fn _mm256_abs_epi16() {
-2, 3, -3, 4,
-4, 5, -5, std::i16::MAX,
std::i16::MIN + 1, 100, -100, -32);
let r = avx2::_mm256_abs_epi16(a);
let r = unsafe { avx2::_mm256_abs_epi16(a) };
let e = i16x16::new(
0, 1, 1, 2,
2, 3, 3, 4,
@@ -1191,7 +1188,7 @@ fn _mm256_abs_epi8() {
-2, 3, -3, 4,
-4, 5, -5, std::i8::MAX,
std::i8::MIN + 1, 100, -100, -32);
let r = avx2::_mm256_abs_epi8(a);
let r = unsafe { avx2::_mm256_abs_epi8(a) };
let e = i8x32::new(
0, 1, 1, 2, 2, 3, 3, 4,
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
@@ -1204,7 +1201,7 @@ fn _mm256_abs_epi8() {
fn _mm256_add_epi64() {
let a = i64x4::new(-10, 0, 100, 1_000_000_000);
let b = i64x4::new(-1, 0, 1, 2);
let r = avx2::_mm256_add_epi64(a, b);
let r = unsafe { avx2::_mm256_add_epi64(a, b) };
let e = i64x4::new(-11, 0, 101, 1_000_000_002);
assert_eq!(r, e);
}
@@ -1213,7 +1210,7 @@ fn _mm256_add_epi64() {
fn _mm256_add_epi32() {
let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6);
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx2::_mm256_add_epi32(a, b);
let r = unsafe { avx2::_mm256_add_epi32(a, b) };
let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14);
assert_eq!(r, e);
}
@@ -1226,7 +1223,7 @@ fn _mm256_add_epi16() {
let b = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let r = avx2::_mm256_add_epi16(a, b);
let r = unsafe { avx2::_mm256_add_epi16(a, b) };
let e = i16x16::new(
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30);
@@ -1245,7 +1242,7 @@ fn _mm256_add_epi8() {
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31);
let r = avx2::_mm256_add_epi8(a, b);
let r = unsafe { avx2::_mm256_add_epi8(a, b) };
let e = i8x32::new(
0, 2, 4, 6, 8, 10, 12, 14, 16,
18, 20, 22, 24, 26, 28, 30, 32,
@@ -1262,7 +1259,7 @@ fn _mm256_adds_epi8() {
let b = i8x32::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
let r = avx2::_mm256_adds_epi8(a, b);
let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
let e = i8x32::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
@@ -1273,7 +1270,7 @@ fn _mm256_adds_epi8() {
fn _mm256_adds_epi8_saturate_positive() {
let a = i8x32::splat(0x7F);
let b = i8x32::splat(1);
let r = avx2::_mm256_adds_epi8(a, b);
let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
assert_eq!(r, a);
}
@@ -1281,7 +1278,7 @@ fn _mm256_adds_epi8_saturate_positive() {
fn _mm256_adds_epi8_saturate_negative() {
let a = i8x32::splat(-0x80);
let b = i8x32::splat(-1);
let r = avx2::_mm256_adds_epi8(a, b);
let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
assert_eq!(r, a);
}
@@ -1291,7 +1288,7 @@ fn _mm256_adds_epi16() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i16x16::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
let r = avx2::_mm256_adds_epi16(a, b);
let r = unsafe { avx2::_mm256_adds_epi16(a, b) };
let e = i16x16::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
@@ -1302,7 +1299,7 @@ fn _mm256_adds_epi16() {
fn _mm256_adds_epi16_saturate_positive() {
let a = i16x16::splat(0x7FFF);
let b = i16x16::splat(1);
let r = avx2::_mm256_adds_epi16(a, b);
let r = unsafe { avx2::_mm256_adds_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1310,7 +1307,7 @@ fn _mm256_adds_epi16_saturate_positive() {
fn _mm256_adds_epi16_saturate_negative() {
let a = i16x16::splat(-0x8000);
let b = i16x16::splat(-1);
let r = avx2::_mm256_adds_epi16(a, b);
let r = unsafe { avx2::_mm256_adds_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1322,7 +1319,7 @@ fn _mm256_adds_epu8() {
let b = u8x32::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
let r = avx2::_mm256_adds_epu8(a, b);
let r = unsafe { avx2::_mm256_adds_epu8(a, b) };
let e = u8x32::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
@@ -1333,7 +1330,7 @@ fn _mm256_adds_epu8() {
fn _mm256_adds_epu8_saturate() {
let a = u8x32::splat(0xFF);
let b = u8x32::splat(1);
let r = avx2::_mm256_adds_epu8(a, b);
let r = unsafe { avx2::_mm256_adds_epu8(a, b) };
assert_eq!(r, a);
}
@@ -1344,7 +1341,7 @@ fn _mm256_adds_epu16() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u16x16::new(
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
let r = avx2::_mm256_adds_epu16(a, b);
let r = unsafe { avx2::_mm256_adds_epu16(a, b) };
let e = u16x16::new(
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
@@ -1355,35 +1352,37 @@ fn _mm256_adds_epu16() {
fn _mm256_adds_epu16_saturate() {
let a = u16x16::splat(0xFFFF);
let b = u16x16::splat(1);
let r = avx2::_mm256_adds_epu16(a, b);
let r = unsafe { avx2::_mm256_adds_epu16(a, b) };
assert_eq!(r, a);
}
// Expects `_mm256_and_si256` of an all-5 and an all-3 vector to be all-1
// (5 & 3 == 1).
// Diff view: the first `assert_eq!` (three lines, bare call) is the
// pre-commit form; the `let got = unsafe { .. }` statement and the second
// `assert_eq!` replace it.
#[simd_test = "avx2"]
fn _mm256_and_si256() {
assert_eq!(
avx2::_mm256_and_si256(
__m256i::splat(5), __m256i::splat(3)),__m256i::splat(1));
let got = unsafe {
avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3))
};
assert_eq!(got, __m256i::splat(1));
}
// Expects `_mm256_andnot_si256` of an all-5 and an all-3 vector to be all-2
// (!5 & 3 == 2).
// Diff view: the first `assert_eq!` (three lines, bare call) is the
// pre-commit form; the `let got = unsafe { .. }` statement and the second
// `assert_eq!` replace it.
#[simd_test = "avx2"]
fn _mm256_andnot_si256() {
assert_eq!(
avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)),
__m256i::splat(2));
let got = unsafe {
avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3))
};
assert_eq!(got, __m256i::splat(2));
}
// Expects `_mm256_avg_epu8` of an all-3 and an all-9 `u8x32` to be all-6.
// Diff view: of the two `let r` lines, the bare call is the pre-commit form
// and the `unsafe { .. }` one replaces it.
#[simd_test = "avx2"]
fn _mm256_avg_epu8() {
let (a, b) = (u8x32::splat(3), u8x32::splat(9));
let r = avx2::_mm256_avg_epu8(a, b);
let r = unsafe { avx2::_mm256_avg_epu8(a, b) };
assert_eq!(r, u8x32::splat(6));
}
// Expects `_mm256_avg_epu16` of an all-3 and an all-9 `u16x16` to be all-6.
// Diff view: of the two `let r` lines, the bare call is the pre-commit form
// and the `unsafe { .. }` one replaces it.
#[simd_test = "avx2"]
fn _mm256_avg_epu16() {
let (a, b) = (u16x16::splat(3), u16x16::splat(9));
let r = avx2::_mm256_avg_epu16(a, b);
let r = unsafe { avx2::_mm256_avg_epu16(a, b) };
assert_eq!(r, u16x16::splat(6));
}
@@ -1392,7 +1391,7 @@ fn _mm256_blendv_epi8() {
let (a,b) = (i8x32::splat(4),i8x32::splat(2));
let mask = i8x32::splat(0).replace(2,-1);
let e = i8x32::splat(4).replace(2,2);
let r= avx2::_mm256_blendv_epi8(a,b,mask);
let r= unsafe { avx2::_mm256_blendv_epi8(a,b,mask) };
assert_eq!(r,e);
}
@@ -1404,7 +1403,7 @@ fn _mm256_cmpeq_epi8() {
let b = i8x32::new(
31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = avx2::_mm256_cmpeq_epi8(a, b);
let r = unsafe { avx2::_mm256_cmpeq_epi8(a, b) };
assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8));
}
@@ -1414,7 +1413,7 @@ fn _mm256_cmpeq_epi16() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i16x16::new(
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = avx2::_mm256_cmpeq_epi16(a, b);
let r = unsafe { avx2::_mm256_cmpeq_epi16(a, b) };
assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16));
}
@@ -1422,7 +1421,7 @@ fn _mm256_cmpeq_epi16() {
fn _mm256_cmpeq_epi32() {
let a = i32x8::new(0, 1, 2, 3,4,5,6,7);
let b = i32x8::new(7,6,2,4,3, 2, 1, 0);
let r = avx2::_mm256_cmpeq_epi32(a, b);
let r = unsafe { avx2::_mm256_cmpeq_epi32(a, b) };
assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
}
@@ -1430,7 +1429,7 @@ fn _mm256_cmpeq_epi32() {
fn _mm256_cmpeq_epi64() {
let a = i64x4::new(0, 1, 2, 3);
let b = i64x4::new(3, 2, 2, 0);
let r = avx2::_mm256_cmpeq_epi64(a, b);
let r = unsafe { avx2::_mm256_cmpeq_epi64(a, b) };
assert_eq!(r, i64x4::splat(0).replace(
2, 0xFFFFFFFFFFFFFFFFu64 as i64));
}
@@ -1439,7 +1438,7 @@ fn _mm256_cmpeq_epi64() {
fn _mm256_cmpgt_epi8() {
let a = i8x32::splat(0).replace(0, 5);
let b = i8x32::splat(0);
let r = avx2::_mm256_cmpgt_epi8(a, b);
let r = unsafe { avx2::_mm256_cmpgt_epi8(a, b) };
assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8));
}
@@ -1447,7 +1446,7 @@ fn _mm256_cmpgt_epi8() {
fn _mm256_cmpgt_epi16() {
let a = i16x16::splat(0).replace(0, 5);
let b = i16x16::splat(0);
let r = avx2::_mm256_cmpgt_epi16(a, b);
let r = unsafe { avx2::_mm256_cmpgt_epi16(a, b) };
assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16));
}
@@ -1455,7 +1454,7 @@ fn _mm256_cmpgt_epi16() {
fn _mm256_cmpgt_epi32() {
let a = i32x8::splat(0).replace(0, 5);
let b = i32x8::splat(0);
let r = avx2::_mm256_cmpgt_epi32(a, b);
let r = unsafe { avx2::_mm256_cmpgt_epi32(a, b) };
assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
@@ -1463,7 +1462,7 @@ fn _mm256_cmpgt_epi32() {
fn _mm256_cmpgt_epi64() {
let a = i64x4::splat(0).replace(0, 5);
let b = i64x4::splat(0);
let r = avx2::_mm256_cmpgt_epi64(a, b);
let r = unsafe { avx2::_mm256_cmpgt_epi64(a, b) };
assert_eq!(r, i64x4::splat(0).replace(
0, 0xFFFFFFFFFFFFFFFFu64 as i64));
}
@@ -1472,7 +1471,7 @@ fn _mm256_cmpgt_epi64() {
fn _mm256_hadd_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_hadd_epi16(a, b);
let r = unsafe { avx2::_mm256_hadd_epi16(a, b) };
let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
assert_eq!(r, e);
}
@@ -1481,7 +1480,7 @@ fn _mm256_hadd_epi16() {
fn _mm256_hadd_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_hadd_epi32(a, b);
let r = unsafe { avx2::_mm256_hadd_epi32(a, b) };
let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8);
assert_eq!(r, e);
}
@@ -1490,7 +1489,7 @@ fn _mm256_hadd_epi32() {
fn _mm256_hadds_epi16() {
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1);
let b = i16x16::splat(4);
let r = avx2::_mm256_hadds_epi16(a, b);
let r = unsafe { avx2::_mm256_hadds_epi16(a, b) };
let e = i16x16::new(
0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
assert_eq!(r, e);
@@ -1500,7 +1499,7 @@ fn _mm256_hadds_epi16() {
fn _mm256_hsub_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_hsub_epi16(a, b);
let r = unsafe { avx2::_mm256_hsub_epi16(a, b) };
let e = i16x16::splat(0);
assert_eq!(r, e);
}
@@ -1509,7 +1508,7 @@ fn _mm256_hsub_epi16() {
fn _mm256_hsub_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_hsub_epi32(a, b);
let r = unsafe { avx2::_mm256_hsub_epi32(a, b) };
let e = i32x8::splat(0);
assert_eq!(r, e);
}
@@ -1518,7 +1517,7 @@ fn _mm256_hsub_epi32() {
fn _mm256_hsubs_epi16() {
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);
let b = i16x16::splat(4);
let r = avx2::_mm256_hsubs_epi16(a, b);
let r = unsafe { avx2::_mm256_hsubs_epi16(a, b) };
let e = i16x16::splat(0).replace(0,0x7FFF);
assert_eq!(r, e);
}
@@ -1527,7 +1526,7 @@ fn _mm256_hsubs_epi16() {
fn _mm256_madd_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_madd_epi16(a, b);
let r = unsafe { avx2::_mm256_madd_epi16(a, b) };
let e = i32x8::splat(16);
assert_eq!(r, e);
}
@@ -1536,7 +1535,7 @@ fn _mm256_madd_epi16() {
fn _mm256_maddubs_epi16() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_maddubs_epi16(a, b);
let r = unsafe { avx2::_mm256_maddubs_epi16(a, b) };
let e = i16x16::splat(16);
assert_eq!(r, e);
}
@@ -1545,7 +1544,7 @@ fn _mm256_maddubs_epi16() {
fn _mm256_max_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_max_epi16(a, b);
let r = unsafe { avx2::_mm256_max_epi16(a, b) };
assert_eq!(r, b);
}
@@ -1553,7 +1552,7 @@ fn _mm256_max_epi16() {
fn _mm256_max_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_max_epi32(a, b);
let r = unsafe { avx2::_mm256_max_epi32(a, b) };
assert_eq!(r, b);
}
@@ -1561,7 +1560,7 @@ fn _mm256_max_epi32() {
fn _mm256_max_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(4);
let r = avx2::_mm256_max_epi8(a, b);
let r = unsafe { avx2::_mm256_max_epi8(a, b) };
assert_eq!(r, b);
}
@@ -1569,7 +1568,7 @@ fn _mm256_max_epi8() {
fn _mm256_max_epu16() {
let a = u16x16::splat(2);
let b = u16x16::splat(4);
let r = avx2::_mm256_max_epu16(a, b);
let r = unsafe { avx2::_mm256_max_epu16(a, b) };
assert_eq!(r, b);
}
@@ -1577,7 +1576,7 @@ fn _mm256_max_epu16() {
fn _mm256_max_epu32() {
let a = u32x8::splat(2);
let b = u32x8::splat(4);
let r = avx2::_mm256_max_epu32(a, b);
let r = unsafe { avx2::_mm256_max_epu32(a, b) };
assert_eq!(r, b);
}
@@ -1585,7 +1584,7 @@ fn _mm256_max_epu32() {
fn _mm256_max_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_max_epu8(a, b);
let r = unsafe { avx2::_mm256_max_epu8(a, b) };
assert_eq!(r, b);
}
@@ -1593,7 +1592,7 @@ fn _mm256_max_epu8() {
fn _mm256_min_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_min_epi16(a, b);
let r = unsafe { avx2::_mm256_min_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1601,7 +1600,7 @@ fn _mm256_min_epi16() {
fn _mm256_min_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_min_epi32(a, b);
let r = unsafe { avx2::_mm256_min_epi32(a, b) };
assert_eq!(r, a);
}
@@ -1609,7 +1608,7 @@ fn _mm256_min_epi32() {
fn _mm256_min_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(4);
let r = avx2::_mm256_min_epi8(a, b);
let r = unsafe { avx2::_mm256_min_epi8(a, b) };
assert_eq!(r, a);
}
@@ -1617,7 +1616,7 @@ fn _mm256_min_epi8() {
fn _mm256_min_epu16() {
let a = u16x16::splat(2);
let b = u16x16::splat(4);
let r = avx2::_mm256_min_epu16(a, b);
let r = unsafe { avx2::_mm256_min_epu16(a, b) };
assert_eq!(r, a);
}
@@ -1625,7 +1624,7 @@ fn _mm256_min_epu16() {
fn _mm256_min_epu32() {
let a = u32x8::splat(2);
let b = u32x8::splat(4);
let r = avx2::_mm256_min_epu32(a, b);
let r = unsafe { avx2::_mm256_min_epu32(a, b) };
assert_eq!(r, a);
}
@@ -1633,7 +1632,7 @@ fn _mm256_min_epu32() {
fn _mm256_min_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_min_epu8(a, b);
let r = unsafe { avx2::_mm256_min_epu8(a, b) };
assert_eq!(r, a);
}
@@ -1665,7 +1664,7 @@ fn _mm256_mpsadbw_epu8() {
fn _mm256_mul_epi32() {
let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx2::_mm256_mul_epi32(a, b);
let r = unsafe { avx2::_mm256_mul_epi32(a, b) };
let e = i64x4::new(0, 0, 10, 14);
assert_eq!(r, e);
}
@@ -1674,7 +1673,7 @@ fn _mm256_mul_epi32() {
fn _mm256_mul_epu32() {
let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx2::_mm256_mul_epu32(a, b);
let r = unsafe { avx2::_mm256_mul_epu32(a, b) };
let e = u64x4::new(0, 0, 10, 14);
assert_eq!(r, e);
}
@@ -1683,7 +1682,7 @@ fn _mm256_mul_epu32() {
fn _mm256_mulhi_epi16() {
let a = i16x16::splat(6535);
let b = i16x16::splat(6535);
let r = avx2::_mm256_mulhi_epi16(a, b);
let r = unsafe { avx2::_mm256_mulhi_epi16(a, b) };
let e = i16x16::splat(651);
assert_eq!(r, e);
}
@@ -1692,7 +1691,7 @@ fn _mm256_mulhi_epi16() {
fn _mm256_mulhi_epu16() {
let a = u16x16::splat(6535);
let b = u16x16::splat(6535);
let r = avx2::_mm256_mulhi_epu16(a, b);
let r = unsafe { avx2::_mm256_mulhi_epu16(a, b) };
let e = u16x16::splat(651);
assert_eq!(r, e);
}
@@ -1701,7 +1700,7 @@ fn _mm256_mulhi_epu16() {
fn _mm256_mullo_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_mullo_epi16(a, b);
let r = unsafe { avx2::_mm256_mullo_epi16(a, b) };
let e = i16x16::splat(8);
assert_eq!(r, e);
}
@@ -1710,7 +1709,7 @@ fn _mm256_mullo_epi16() {
fn _mm256_mullo_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_mullo_epi32(a, b);
let r = unsafe { avx2::_mm256_mullo_epi32(a, b) };
let e = i32x8::splat(8);
assert_eq!(r, e);
}
@@ -1719,7 +1718,7 @@ fn _mm256_mullo_epi32() {
// NOTE(review): despite its name, this test calls `_mm256_mullo_epi16`, both
// in the pre-commit `let r` line and in the `unsafe { .. }` line that
// replaces it, so the intrinsic the test is named after is not exercised
// here. The expected value 8 matches 2 * 4.
// TODO: call the mulhrs intrinsic instead and update `e` to its result.
fn _mm256_mulhrs_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_mullo_epi16(a, b);
let r = unsafe { avx2::_mm256_mullo_epi16(a, b) };
let e = i16x16::splat(8);
assert_eq!(r, e);
}
@@ -1728,7 +1727,7 @@ fn _mm256_mulhrs_epi16() {
fn _mm256_or_si256() {
let a = __m256i::splat(-1);
let b = __m256i::splat(0);
let r = avx2::_mm256_or_si256(a, b);
let r = unsafe { avx2::_mm256_or_si256(a, b) };
assert_eq!(r, a);
}
@@ -1736,7 +1735,7 @@ fn _mm256_or_si256() {
fn _mm256_packs_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packs_epi16(a, b);
let r = unsafe { avx2::_mm256_packs_epi16(a, b) };
let e = i8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
@@ -1750,7 +1749,7 @@ fn _mm256_packs_epi16() {
fn _mm256_packs_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packs_epi32(a, b);
let r = unsafe { avx2::_mm256_packs_epi32(a, b) };
let e = i16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
@@ -1764,7 +1763,7 @@ fn _mm256_packs_epi32() {
fn _mm256_packus_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packus_epi16(a, b);
let r = unsafe { avx2::_mm256_packus_epi16(a, b) };
let e = u8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
@@ -1778,7 +1777,7 @@ fn _mm256_packus_epi16() {
fn _mm256_packus_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packus_epi32(a, b);
let r = unsafe { avx2::_mm256_packus_epi32(a, b) };
let e = u16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
@@ -1792,7 +1791,7 @@ fn _mm256_packus_epi32() {
fn _mm256_sad_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_sad_epu8(a, b);
let r = unsafe { avx2::_mm256_sad_epu8(a, b) };
let e = u64x4::splat(16);
assert_eq!(r, e);
}
@@ -1801,7 +1800,7 @@ fn _mm256_sad_epu8() {
fn _mm256_sign_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(-1);
let r = avx2::_mm256_sign_epi16(a, b);
let r = unsafe { avx2::_mm256_sign_epi16(a, b) };
let e = i16x16::splat(-2);
assert_eq!(r, e);
}
@@ -1810,7 +1809,7 @@ fn _mm256_sign_epi16() {
fn _mm256_sign_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(-1);
let r = avx2::_mm256_sign_epi32(a, b);
let r = unsafe { avx2::_mm256_sign_epi32(a, b) };
let e = i32x8::splat(-2);
assert_eq!(r, e);
}
@@ -1819,53 +1818,53 @@ fn _mm256_sign_epi32() {
fn _mm256_sign_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(-1);
let r = avx2::_mm256_sign_epi8(a, b);
let r = unsafe { avx2::_mm256_sign_epi8(a, b) };
let e = i8x32::splat(-2);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
fn _mm256_sll_epi16() {
assert_eq!(
avx2::_mm256_sll_epi16(i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)),
i16x16::splat(0xFF0));
let a = i16x16::splat(0xFF);
let b = i16x8::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_sll_epi16(a, b) };
assert_eq!(r, i16x16::splat(0xFF0));
}
#[simd_test = "avx2"]
fn _mm256_sll_epi32() {
assert_eq!(
avx2::_mm256_sll_epi32(i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)),
i32x8::splat(0xFFFF0));
let a = i32x8::splat(0xFFFF);
let b = i32x4::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_sll_epi32(a, b) };
assert_eq!(r, i32x8::splat(0xFFFF0));
}
#[simd_test = "avx2"]
fn _mm256_sll_epi64() {
assert_eq!(
avx2::_mm256_sll_epi64(i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)),
i64x4::splat(0xFFFFFFFF0));
let a = i64x4::splat(0xFFFFFFFF);
let b = i64x2::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_sll_epi64(a, b) };
assert_eq!(r, i64x4::splat(0xFFFFFFFF0));
}
#[simd_test = "avx2"]
fn _mm256_slli_epi16() {
assert_eq!(
avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
unsafe { avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4) },
i16x16::splat(0xFF0));
}
#[simd_test = "avx2"]
fn _mm256_slli_epi32() {
assert_eq!(
avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4),
unsafe { avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4) },
i32x8::splat(0xFFFF0));
}
#[simd_test = "avx2"]
fn _mm256_slli_epi64() {
assert_eq!(
avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4),
unsafe { avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4) },
i64x4::splat(0xFFFFFFFF0));
}
@@ -1873,7 +1872,7 @@ fn _mm256_slli_epi64() {
fn _mm_sllv_epi32() {
let a = i32x4::splat(2);
let b = i32x4::splat(1);
let r = avx2::_mm_sllv_epi32(a, b);
let r = unsafe { avx2::_mm_sllv_epi32(a, b) };
let e = i32x4::splat(4);
assert_eq!(r, e);
}
@@ -1882,7 +1881,7 @@ fn _mm_sllv_epi32() {
fn _mm256_sllv_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(1);
let r = avx2::_mm256_sllv_epi32(a, b);
let r = unsafe { avx2::_mm256_sllv_epi32(a, b) };
let e = i32x8::splat(4);
assert_eq!(r, e);
}
@@ -1891,7 +1890,7 @@ fn _mm256_sllv_epi32() {
fn _mm_sllv_epi64() {
let a = i64x2::splat(2);
let b = i64x2::splat(1);
let r = avx2::_mm_sllv_epi64(a, b);
let r = unsafe { avx2::_mm_sllv_epi64(a, b) };
let e = i64x2::splat(4);
assert_eq!(r, e);
}
@@ -1900,46 +1899,46 @@ fn _mm_sllv_epi64() {
fn _mm256_sllv_epi64() {
let a = i64x4::splat(2);
let b = i64x4::splat(1);
let r = avx2::_mm256_sllv_epi64(a, b);
let r = unsafe { avx2::_mm256_sllv_epi64(a, b) };
let e = i64x4::splat(4);
assert_eq!(r, e);
}
#[simd_test = "avx2"]
fn _mm256_sra_epi16() {
assert_eq!(
avx2::_mm256_sra_epi16(
i16x16::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)),
i16x16::splat(-1));
let a = i16x16::splat(-1);
let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0);
let r = unsafe { avx2::_mm256_sra_epi16(a, b) };
assert_eq!(r, i16x16::splat(-1));
}
#[simd_test = "avx2"]
fn _mm256_sra_epi32() {
assert_eq!(
avx2::_mm256_sra_epi32(
i32x8::splat(-1), i32x4::splat(0).replace(0,1)),
i32x8::splat(-1));
let a = i32x8::splat(-1);
let b = i32x4::splat(0).replace(0, 1);
let r = unsafe { avx2::_mm256_sra_epi32(a, b) };
assert_eq!(r, i32x8::splat(-1));
}
#[simd_test = "avx2"]
fn _mm256_srai_epi16() {
assert_eq!(
avx2::_mm256_srai_epi16(
i16x16::splat(-1), 1), i16x16::splat(-1));
assert_eq!(
unsafe { avx2::_mm256_srai_epi16(i16x16::splat(-1), 1) },
i16x16::splat(-1));
}
#[simd_test = "avx2"]
fn _mm256_srai_epi32() {
assert_eq!(
avx2::_mm256_srai_epi32(
i32x8::splat(-1), 1), i32x8::splat(-1));
assert_eq!(
unsafe { avx2::_mm256_srai_epi32(i32x8::splat(-1), 1) },
i32x8::splat(-1));
}
#[simd_test = "avx2"]
fn _mm_srav_epi32() {
let a = i32x4::splat(4);
let count = i32x4::splat(1);
let r = avx2::_mm_srav_epi32(a, count);
let r = unsafe { avx2::_mm_srav_epi32(a, count) };
let e = i32x4::splat(2);
assert_eq!(r, e );
}
@@ -1948,53 +1947,53 @@ fn _mm_srav_epi32() {
fn _mm256_srav_epi32() {
let a = i32x8::splat(4);
let count = i32x8::splat(1);
let r = avx2::_mm256_srav_epi32(a, count);
let r = unsafe { avx2::_mm256_srav_epi32(a, count) };
let e = i32x8::splat(2);
assert_eq!(r, e );
}
#[simd_test = "avx2"]
fn _mm256_srl_epi16() {
assert_eq!(
avx2::_mm256_srl_epi16(
i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)),
i16x16::splat(0xF));
let a = i16x16::splat(0xFF);
let b = i16x8::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_srl_epi16(a, b) };
assert_eq!(r, i16x16::splat(0xF));
}
#[simd_test = "avx2"]
fn _mm256_srl_epi32() {
assert_eq!(
avx2::_mm256_srl_epi32(
i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)),
i32x8::splat(0xFFF));
let a = i32x8::splat(0xFFFF);
let b = i32x4::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_srl_epi32(a, b) };
assert_eq!(r, i32x8::splat(0xFFF));
}
#[simd_test = "avx2"]
fn _mm256_srl_epi64() {
assert_eq!(
avx2::_mm256_srl_epi64(
i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)),
i64x4::splat(0xFFFFFFF));
let a = i64x4::splat(0xFFFFFFFF);
let b = i64x2::splat(0).replace(0, 4);
let r = unsafe { avx2::_mm256_srl_epi64(a, b) };
assert_eq!(r, i64x4::splat(0xFFFFFFF));
}
#[simd_test = "avx2"]
fn _mm256_srli_epi16() {
assert_eq!(
avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4),
unsafe { avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4) },
i16x16::splat(0xF));
}
#[simd_test = "avx2"]
fn _mm256_srli_epi32() {
assert_eq!(
avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4),
unsafe { avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4) },
i32x8::splat(0xFFF));
}
#[simd_test = "avx2"]
fn _mm256_srli_epi64() {
assert_eq!(
avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4),
unsafe { avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4) },
i64x4::splat(0xFFFFFFF));
}
@@ -2002,7 +2001,7 @@ fn _mm256_srli_epi64() {
fn _mm_srlv_epi32() {
let a = i32x4::splat(2);
let count = i32x4::splat(1);
let r = avx2::_mm_srlv_epi32(a, count);
let r = unsafe { avx2::_mm_srlv_epi32(a, count) };
let e = i32x4::splat(1);
assert_eq!(r, e);
}
@@ -2011,7 +2010,7 @@ fn _mm_srlv_epi32() {
fn _mm256_srlv_epi32() {
let a = i32x8::splat(2);
let count = i32x8::splat(1);
let r = avx2::_mm256_srlv_epi32(a, count);
let r = unsafe { avx2::_mm256_srlv_epi32(a, count) };
let e = i32x8::splat(1);
assert_eq!(r, e);
}
@@ -2020,7 +2019,7 @@ fn _mm256_srlv_epi32() {
fn _mm_srlv_epi64() {
let a = i64x2::splat(2);
let count = i64x2::splat(1);
let r = avx2::_mm_srlv_epi64(a, count);
let r = unsafe { avx2::_mm_srlv_epi64(a, count) };
let e = i64x2::splat(1);
assert_eq!(r, e);
}
@@ -2030,7 +2029,7 @@ fn _mm_srlv_epi64() {
fn _mm256_srlv_epi64() {
let a = i64x4::splat(2);
let count = i64x4::splat(1);
let r = avx2::_mm256_srlv_epi64(a, count);
let r = unsafe { avx2::_mm256_srlv_epi64(a, count) };
let e = i64x4::splat(1);
assert_eq!(r, e);
}
@@ -2039,7 +2038,7 @@ fn _mm256_srlv_epi64() {
fn _mm256_sub_epi16() {
let a = i16x16::splat(4);
let b = i16x16::splat(2);
let r = avx2::_mm256_sub_epi16(a, b);
let r = unsafe { avx2::_mm256_sub_epi16(a, b) };
assert_eq!(r, b);
}
@@ -2047,7 +2046,7 @@ fn _mm256_sub_epi16() {
fn _mm256_sub_epi32() {
let a = i32x8::splat(4);
let b = i32x8::splat(2);
let r = avx2::_mm256_sub_epi32(a, b);
let r = unsafe { avx2::_mm256_sub_epi32(a, b) };
assert_eq!(r, b);
}
@@ -2055,7 +2054,7 @@ fn _mm256_sub_epi32() {
fn _mm256_sub_epi64() {
let a = i64x4::splat(4);
let b = i64x4::splat(2);
let r = avx2::_mm256_sub_epi64(a, b);
let r = unsafe { avx2::_mm256_sub_epi64(a, b) };
assert_eq!(r, b);
}
@@ -2063,7 +2062,7 @@ fn _mm256_sub_epi64() {
fn _mm256_sub_epi8() {
let a = i8x32::splat(4);
let b = i8x32::splat(2);
let r = avx2::_mm256_sub_epi8(a, b);
let r = unsafe { avx2::_mm256_sub_epi8(a, b) };
assert_eq!(r, b);
}
@@ -2071,7 +2070,7 @@ fn _mm256_sub_epi8() {
fn _mm256_subs_epi16() {
let a = i16x16::splat(4);
let b = i16x16::splat(2);
let r = avx2::_mm256_subs_epi16(a, b);
let r = unsafe { avx2::_mm256_subs_epi16(a, b) };
assert_eq!(r, b);
}
@@ -2079,7 +2078,7 @@ fn _mm256_subs_epi16() {
fn _mm256_subs_epi8() {
let a = i8x32::splat(4);
let b = i8x32::splat(2);
let r = avx2::_mm256_subs_epi8(a, b);
let r = unsafe { avx2::_mm256_subs_epi8(a, b) };
assert_eq!(r, b);
}
@@ -2087,7 +2086,7 @@ fn _mm256_subs_epi8() {
fn _mm256_subs_epu16() {
let a = u16x16::splat(4);
let b = u16x16::splat(2);
let r = avx2::_mm256_subs_epu16(a, b);
let r = unsafe { avx2::_mm256_subs_epu16(a, b) };
assert_eq!(r, b);
}
@@ -2095,14 +2094,15 @@ fn _mm256_subs_epu16() {
fn _mm256_subs_epu8() {
let a = u8x32::splat(4);
let b = u8x32::splat(2);
let r = avx2::_mm256_subs_epu8(a, b);
let r = unsafe { avx2::_mm256_subs_epu8(a, b) };
assert_eq!(r, b);
}
#[simd_test = "avx2"]
fn _mm256_xor_si256() {
assert_eq!(
avx2::_mm256_xor_si256(__m256i::splat(5), __m256i::splat(3)),
__m256i::splat(6));
let a = __m256i::splat(5);
let b = __m256i::splat(3);
let r = unsafe { avx2::_mm256_xor_si256(a, b) };
assert_eq!(r, __m256i::splat(6));
}
}
+88 -64
View File
@@ -10,20 +10,12 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}
/// Extracts bits in range [`start`, `start` + `len`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
_bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
}
@@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
#[cfg(not(target_arch = "x86"))]
pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
_bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
}
@@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
unsafe { x86_bmi_bextr_32(a, control) }
pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
x86_bmi_bextr_32(a, control)
}
/// Extracts bits of `a` specified by `control` into
@@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
#[cfg(not(target_arch = "x86"))]
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
unsafe { x86_bmi_bextr_64(a, control) }
pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
x86_bmi_bextr_64(a, control)
}
/// Bitwise logical `AND` of inverted `a` with `b`.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u32(a: u32, b: u32) -> u32 {
pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
!a & b
}
@@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u64(a: u64, b: u64) -> u64 {
pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
!a & b
}
@@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
pub fn _blsi_u32(x: u32) -> u32 {
pub unsafe fn _blsi_u32(x: u32) -> u32 {
x & x.wrapping_neg()
}
@@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsi_u64(x: u64) -> u64 {
pub unsafe fn _blsi_u64(x: u64) -> u64 {
x & x.wrapping_neg()
}
@@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
pub fn _blsmsk_u32(x: u32) -> u32 {
pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
x ^ (x.wrapping_sub(1u32))
}
@@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsmsk_u64(x: u64) -> u64 {
pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
x ^ (x.wrapping_sub(1u64))
}
@@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
pub fn _blsr_u32(x: u32) -> u32 {
pub unsafe fn _blsr_u32(x: u32) -> u32 {
x & (x.wrapping_sub(1))
}
@@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsr_u64(x: u64) -> u64 {
pub unsafe fn _blsr_u64(x: u64) -> u64 {
x & (x.wrapping_sub(1))
}
@@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u16(x: u16) -> u16 {
pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
x.trailing_zeros() as u16
}
@@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u32(x: u32) -> u32 {
pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
@@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u64(x: u64) -> u64 {
pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
@@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u32(x: u32) -> u32 {
pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
@@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u64(x: u64) -> u64 {
pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
// Foreign declarations bound through `link_name` to the
// `llvm.x86.bmi.bextr.*` symbols; `_bextr2_u32` and `_bextr2_u64` forward
// their `(a, control)` arguments to these.
// NOTE(review): `dead_code` is presumably allowed because the 64-bit wrapper
// is compiled out on `x86`, leaving `x86_bmi_bextr_64` unused there — confirm.
#[allow(dead_code)]
extern "C" {
// 32-bit variant.
#[link_name="llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
// 64-bit variant.
#[link_name="llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
@@ -191,98 +191,122 @@ mod tests {
#[simd_test = "bmi"]
fn _bextr_u32() {
assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
let r = unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) };
assert_eq!(r, 0b0000_0101u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _bextr_u64() {
assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) };
assert_eq!(r, 0b0000_0101u64);
}
#[simd_test = "bmi"]
fn _andn_u32() {
assert_eq!(bmi::_andn_u32(0, 0), 0);
assert_eq!(bmi::_andn_u32(0, 1), 1);
assert_eq!(bmi::_andn_u32(1, 0), 0);
assert_eq!(bmi::_andn_u32(1, 1), 0);
assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1);
assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) };
assert_eq!(r, 0b1111_1111u32);
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) };
assert_eq!(r, 0b0001_1101u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _andn_u64() {
assert_eq!(bmi::_andn_u64(0, 0), 0);
assert_eq!(bmi::_andn_u64(0, 1), 1);
assert_eq!(bmi::_andn_u64(1, 0), 0);
assert_eq!(bmi::_andn_u64(1, 1), 0);
assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1);
assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) };
assert_eq!(r, 0b1111_1111u64);
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) };
assert_eq!(r, 0b0001_1101u64);
}
#[simd_test = "bmi"]
fn _blsi_u32() {
assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsi_u64() {
assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64);
}
#[simd_test = "bmi"]
fn _blsmsk_u32() {
assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) };
assert_eq!(r, 0b0001_1111u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsmsk_u64() {
assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) };
assert_eq!(r, 0b0001_1111u64);
}
#[simd_test = "bmi"]
fn _blsr_u32() {
/// TODO: test the behavior when the input is 0
assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
// TODO: test the behavior when the input is 0
let r = unsafe { bmi::_blsr_u32(0b0011_0000u32) };
assert_eq!(r, 0b0010_0000u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsr_u64() {
/// TODO: test the behavior when the input is 0
assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
// TODO: test the behavior when the input is 0
let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) };
assert_eq!(r, 0b0010_0000u64);
}
#[simd_test = "bmi"]
fn _tzcnt_u16() {
assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16);
}
#[simd_test = "bmi"]
fn _tzcnt_u32() {
assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _tzcnt_u64() {
assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64);
}
}
+69 -71
View File
@@ -19,7 +19,7 @@
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
#[target_feature = "+bmi2"]
pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
let result: u64 = (a as u64) * (b as u64);
let hi = (result >> 32) as u32;
(result as u32, hi)
@@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
#[cfg_attr(test, assert_instr(mulx))]
#[target_feature = "+bmi2"]
#[cfg(not(target_arch = "x86"))] // calls an intrinsic
pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
let result: u128 = (a as u128) * (b as u128);
let hi = (result >> 64) as u64;
(result as u64, hi)
}
/// Zeroes the bits of `a` at positions greater than or equal to `index`.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_bzhi_32`
/// binding and its result is returned unchanged.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
x86_bmi2_bzhi_32(a, index)
}
/// Zeroes the bits of `a` at positions greater than or equal to `index`.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_bzhi_64`
/// binding and its result is returned unchanged. Not compiled when
/// `target_arch` is `x86`.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
x86_bmi2_bzhi_64(a, index)
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_pdep_32`
/// binding and its result is returned unchanged.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
x86_bmi2_pdep_32(a, mask)
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_pdep_64`
/// binding and its result is returned unchanged. Not compiled when
/// `target_arch` is `x86`.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
x86_bmi2_pdep_64(a, mask)
}
/// Gathers the bits of `a` specified by the `mask` into the contiguous low
/// order bit positions of the result.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_pext_32`
/// binding and its result is returned unchanged.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
x86_bmi2_pext_32(a, mask)
}
/// Gathers the bits of `a` specified by the `mask` into the contiguous low
/// order bit positions of the result.
///
/// Thin wrapper: both arguments are forwarded to the `x86_bmi2_pext_64`
/// binding and its result is returned unchanged. Not compiled when
/// `target_arch` is `x86`.
///
/// # Safety
///
/// NOTE(review): presumably the caller must guarantee that the running CPU
/// supports the `bmi2` feature this function is compiled for — confirm.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
x86_bmi2_pext_64(a, mask)
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bzhi.32"]
@@ -55,63 +110,6 @@ pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
unsafe { x86_bmi2_bzhi_32(a, index) }
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
#[cfg(not(target_arch = "x86"))]
pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
unsafe { x86_bmi2_bzhi_64(a, index) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pdep_32(a, mask) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
#[cfg(not(target_arch = "x86"))]
pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pdep_64(a, mask) }
}
/// Gathers the bits of `a` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub fn _pext_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pext_32(a, mask) }
}
/// Gathers the bits of `a` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
#[cfg(not(target_arch = "x86"))]
pub fn _pext_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pext_64(a, mask) }
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
@@ -128,8 +126,8 @@ fn _pext_u32() {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b0001_0111_0100_0011u32;
assert_eq!(bmi2::_pext_u32(n, m0), s0);
assert_eq!(bmi2::_pext_u32(n, m1), s1);
assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@@ -143,8 +141,8 @@ fn _pext_u64() {
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b0001_0111_0100_0011u64;
assert_eq!(bmi2::_pext_u64(n, m0), s0);
assert_eq!(bmi2::_pext_u64(n, m1), s1);
assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@@ -157,8 +155,8 @@ fn _pdep_u32() {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b1110_1001_0010_0011u32;
assert_eq!(bmi2::_pdep_u32(n, m0), s0);
assert_eq!(bmi2::_pdep_u32(n, m1), s1);
assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@@ -172,15 +170,15 @@ fn _pdep_u64() {
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b1110_1001_0010_0011u64;
assert_eq!(bmi2::_pdep_u64(n, m0), s0);
assert_eq!(bmi2::_pdep_u64(n, m1), s1);
assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1);
}
#[simd_test = "bmi2"]
fn _bzhi_u32() {
let n = 0b1111_0010u32;
let s = 0b0001_0010u32;
assert_eq!(bmi2::_bzhi_u32(n, 5), s);
assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s);
}
#[simd_test = "bmi2"]
@@ -188,14 +186,14 @@ fn _bzhi_u32() {
fn _bzhi_u64() {
let n = 0b1111_0010u64;
let s = 0b0001_0010u64;
assert_eq!(bmi2::_bzhi_u64(n, 5), s);
assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s);
}
#[simd_test = "bmi2"]
fn _mulx_u32() {
let a: u32 = 4_294_967_200;
let b: u32 = 2;
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) };
// result = 8589934400
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
// ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -208,7 +206,7 @@ fn _mulx_u32() {
fn _mulx_u64() {
let a: u64 = 9_223_372_036_854_775_800;
let b: u64 = 100;
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) };
// result = 922337203685477580000
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
// ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+80 -77
View File
@@ -9,15 +9,15 @@
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(addss))]
pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { addss(a, b) }
pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
addss(a, b)
}
/// Adds f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(addps))]
pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
a + b
}
@@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(subss))]
pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { subss(a, b) }
pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
subss(a, b)
}
/// Subtracts f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(subps))]
pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
a - b
}
@@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(mulss))]
pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { mulss(a, b) }
pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
mulss(a, b)
}
/// Multiplies f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(mulps))]
pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
a * b
}
@@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(divss))]
pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { divss(a, b) }
pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
divss(a, b)
}
/// Divides f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(divps))]
pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
a / b
}
@@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sqrtss))]
pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
unsafe { sqrtss(a) }
pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
sqrtss(a)
}
/// Return the square root of packed single-precision (32-bit) floating-point
@@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sqrtps))]
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
unsafe { sqrtps(a) }
pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
sqrtps(a)
}
/// Return the approximate reciprocal of the first single-precision
@@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rcpss))]
pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
unsafe { rcpss(a) }
pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 {
rcpss(a)
}
/// Return the approximate reciprocal of packed single-precision (32-bit)
@@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rcpps))]
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
unsafe { rcpps(a) }
pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 {
rcpps(a)
}
/// Return the approximate reciprocal square root of the first single-precision
@@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rsqrtss))]
pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
unsafe { rsqrtss(a) }
pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
rsqrtss(a)
}
/// Return the approximate reciprocal square root of packed single-precision
@@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rsqrtps))]
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
unsafe { rsqrtps(a) }
pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
rsqrtps(a)
}
/// Compare the first single-precision (32-bit) floating-point element of `a`
@@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(minss))]
pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { minss(a, b) }
pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
minss(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
@@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(minps))]
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { minps(a, b) }
pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
minps(a, b)
}
/// Compare the first single-precision (32-bit) floating-point element of `a`
@@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(maxss))]
pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { maxss(a, b) }
pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
maxss(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
@@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(maxps))]
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { maxps(a, b) }
pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
maxps(a, b)
}
// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
// using `mask`.
// The lower half of result takes values from `a` and the higher half from `b`.
// Mask is split to 2 control bits each to index the element from inputs.
/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `mask`.
///
/// The lower half of the result takes values from `a` and the higher half from
/// `b`. The mask is split into 2-bit fields, each indexing an input element.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
let mask = (mask & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
unsafe {
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
}
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
}
}
macro_rules! shuffle_x67 {
@@ -219,10 +218,10 @@ macro_rules! shuffle_x23 {
}
#[cfg(test)]
#[cfg_attr(test, assert_instr(shufps))]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(shufps))]
fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
_mm_shuffle_ps(a, b, 3)
unsafe { _mm_shuffle_ps(a, b, 3) }
}
/// Unpack and interleave single-precision (32-bit) floating-point elements
@@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpckhps))]
pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [2, 6, 3, 7])
}
/// Unpack and interleave single-precision (32-bit) floating-point elements
@@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpcklps))]
pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [0, 4, 1, 5])
}
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the lower
@@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
#[target_feature = "+sse"]
#[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
#[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
// TODO: figure out why this is a different instruction on Windows.
unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
simd_shuffle4(a, b, [6, 7, 2, 3])
}
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
@@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpcklpd))]
pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [0, 1, 4, 5])
}
/// Return a mask of the most significant bit of each element in `a`.
@@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movmskps))]
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
unsafe { movmskps(a) }
pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
movmskps(a)
}
#[allow(improper_ctypes)]
@@ -318,7 +317,7 @@ mod tests {
fn _mm_add_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_add_ps(a, b);
let r = unsafe { sse::_mm_add_ps(a, b) };
assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
}
@@ -326,7 +325,7 @@ fn _mm_add_ps() {
fn _mm_add_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_add_ss(a, b);
let r = unsafe { sse::_mm_add_ss(a, b) };
assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
}
@@ -334,7 +333,7 @@ fn _mm_add_ss() {
fn _mm_sub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_sub_ps(a, b);
let r = unsafe { sse::_mm_sub_ps(a, b) };
assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
}
@@ -342,7 +341,7 @@ fn _mm_sub_ps() {
fn _mm_sub_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_sub_ss(a, b);
let r = unsafe { sse::_mm_sub_ss(a, b) };
assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
}
@@ -350,7 +349,7 @@ fn _mm_sub_ss() {
fn _mm_mul_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_mul_ps(a, b);
let r = unsafe { sse::_mm_mul_ps(a, b) };
assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
}
@@ -358,7 +357,7 @@ fn _mm_mul_ps() {
fn _mm_mul_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_mul_ss(a, b);
let r = unsafe { sse::_mm_mul_ss(a, b) };
assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
}
@@ -366,7 +365,7 @@ fn _mm_mul_ss() {
fn _mm_div_ps() {
let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
let r = sse::_mm_div_ps(a, b);
let r = unsafe { sse::_mm_div_ps(a, b) };
assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
}
@@ -374,14 +373,14 @@ fn _mm_div_ps() {
fn _mm_div_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_div_ss(a, b);
let r = unsafe { sse::_mm_div_ss(a, b) };
assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
}
#[simd_test = "sse"]
fn _mm_sqrt_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_sqrt_ss(a);
let r = unsafe { sse::_mm_sqrt_ss(a) };
let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@@ -389,7 +388,7 @@ fn _mm_sqrt_ss() {
#[simd_test = "sse"]
fn _mm_sqrt_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_sqrt_ps(a);
let r = unsafe { sse::_mm_sqrt_ps(a) };
let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
assert_eq!(r, e);
}
@@ -397,7 +396,7 @@ fn _mm_sqrt_ps() {
#[simd_test = "sse"]
fn _mm_rcp_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rcp_ss(a);
let r = unsafe { sse::_mm_rcp_ss(a) };
let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@@ -405,7 +404,7 @@ fn _mm_rcp_ss() {
#[simd_test = "sse"]
fn _mm_rcp_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rcp_ps(a);
let r = unsafe { sse::_mm_rcp_ps(a) };
let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
assert_eq!(r, e);
}
@@ -413,7 +412,7 @@ fn _mm_rcp_ps() {
#[simd_test = "sse"]
fn _mm_rsqrt_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rsqrt_ss(a);
let r = unsafe { sse::_mm_rsqrt_ss(a) };
let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@@ -421,7 +420,7 @@ fn _mm_rsqrt_ss() {
#[simd_test = "sse"]
fn _mm_rsqrt_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rsqrt_ps(a);
let r = unsafe { sse::_mm_rsqrt_ps(a) };
let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
assert_eq!(r, e);
}
@@ -430,7 +429,7 @@ fn _mm_rsqrt_ps() {
fn _mm_min_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_min_ss(a, b);
let r = unsafe { sse::_mm_min_ss(a, b) };
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
}
@@ -438,7 +437,7 @@ fn _mm_min_ss() {
fn _mm_min_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_min_ps(a, b);
let r = unsafe { sse::_mm_min_ps(a, b) };
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
}
@@ -446,7 +445,7 @@ fn _mm_min_ps() {
fn _mm_max_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_max_ss(a, b);
let r = unsafe { sse::_mm_max_ss(a, b) };
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
}
@@ -454,7 +453,7 @@ fn _mm_max_ss() {
fn _mm_max_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_max_ps(a, b);
let r = unsafe { sse::_mm_max_ps(a, b) };
assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
}
@@ -463,7 +462,7 @@ fn _mm_shuffle_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let mask = 0b00_01_01_11;
let r = sse::_mm_shuffle_ps(a, b, mask);
let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) };
assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
}
@@ -471,7 +470,7 @@ fn _mm_shuffle_ps() {
fn _mm_unpackhi_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_unpackhi_ps(a, b);
let r = unsafe { sse::_mm_unpackhi_ps(a, b) };
assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
}
@@ -479,7 +478,7 @@ fn _mm_unpackhi_ps() {
fn _mm_unpacklo_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_unpacklo_ps(a, b);
let r = unsafe { sse::_mm_unpacklo_ps(a, b) };
assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
}
@@ -487,7 +486,7 @@ fn _mm_unpacklo_ps() {
fn _mm_movehl_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_movehl_ps(a, b);
let r = unsafe { sse::_mm_movehl_ps(a, b) };
assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
}
@@ -495,16 +494,20 @@ fn _mm_movehl_ps() {
fn _mm_movelh_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_movelh_ps(a, b);
let r = unsafe { sse::_mm_movelh_ps(a, b) };
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
}
#[simd_test = "sse"]
fn _mm_movemask_ps() {
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
let r = unsafe {
sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0))
};
assert_eq!(r, 0b0101);
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
let r = unsafe {
sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0))
};
assert_eq!(r, 0b0111);
}
}
+832 -546
View File
@@ -1,3 +1,6 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
use std::mem;
use std::os::raw::c_void;
use std::ptr;
@@ -9,23 +12,22 @@
use v128::*;
use v64::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
/// Provide a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_pause() {
unsafe { pause() }
#[cfg_attr(test, assert_instr(pause))]
pub unsafe fn _mm_pause() {
pause()
}
/// Invalidate and flush the cache line that contains `p` from all levels of
/// the cache hierarchy.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(clflush))]
pub unsafe fn _mm_clflush(p: *mut c_void) {
clflush(p)
}
@@ -38,8 +40,9 @@ pub unsafe fn _mm_clflush(p: *mut c_void) {
/// program order.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_lfence() {
unsafe { lfence() }
#[cfg_attr(test, assert_instr(lfence))]
pub unsafe fn _mm_lfence() {
lfence()
}
/// Perform a serializing operation on all load-from-memory and store-to-memory
@@ -50,79 +53,89 @@ pub fn _mm_lfence() {
/// which follows the fence in program order.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mfence() {
unsafe { mfence() }
#[cfg_attr(test, assert_instr(mfence))]
pub unsafe fn _mm_mfence() {
mfence()
}
/// Add packed 8-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 {
#[cfg_attr(test, assert_instr(paddb))]
pub unsafe fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 {
a + b
}
/// Add packed 16-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(paddw))]
pub unsafe fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 {
a + b
}
/// Add packed 32-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(paddd))]
pub unsafe fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 {
a + b
}
/// Add packed 64-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 {
#[cfg_attr(test, assert_instr(paddq))]
pub unsafe fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 {
a + b
}
/// Add packed 8-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { paddsb(a, b) }
#[cfg_attr(test, assert_instr(paddsb))]
pub unsafe fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 {
paddsb(a, b)
}
/// Add packed 16-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(paddsw))]
pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { paddsw(a, b) }
pub unsafe fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
paddsw(a, b)
}
/// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { paddsub(a, b) }
#[cfg_attr(test, assert_instr(paddusb))]
pub unsafe fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 {
paddsub(a, b)
}
/// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 {
unsafe { paddsuw(a, b) }
#[cfg_attr(test, assert_instr(paddusw))]
pub unsafe fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 {
paddsuw(a, b)
}
/// Average packed unsigned 8-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { pavgb(a, b) }
#[cfg_attr(test, assert_instr(pavgb))]
pub unsafe fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 {
pavgb(a, b)
}
/// Average packed unsigned 16-bit integers in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
unsafe { pavgw(a, b) }
#[cfg_attr(test, assert_instr(pavgw))]
pub unsafe fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
pavgw(a, b)
}
/// Multiply and then horizontally add signed 16 bit integers in `a` and `b`.
@@ -132,40 +145,45 @@ pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
/// intermediate 32-bit integers.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 {
unsafe { pmaddwd(a, b) }
#[cfg_attr(test, assert_instr(pmaddwd))]
pub unsafe fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 {
pmaddwd(a, b)
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// maximum values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { pmaxsw(a, b) }
#[cfg_attr(test, assert_instr(pmaxsw))]
pub unsafe fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 {
pmaxsw(a, b)
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed maximum values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { pmaxub(a, b) }
#[cfg_attr(test, assert_instr(pmaxub))]
pub unsafe fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 {
pmaxub(a, b)
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// minimum values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { pminsw(a, b) }
#[cfg_attr(test, assert_instr(pminsw))]
pub unsafe fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 {
pminsw(a, b)
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed minimum values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { pminub(a, b) }
#[cfg_attr(test, assert_instr(pminub))]
pub unsafe fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
pminub(a, b)
}
/// Multiply the packed 16-bit integers in `a` and `b`.
@@ -174,8 +192,9 @@ pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
/// high 16 bits of the intermediate integers.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { pmulhw(a, b) }
#[cfg_attr(test, assert_instr(pmulhw))]
pub unsafe fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
pmulhw(a, b)
}
/// Multiply the packed unsigned 16-bit integers in `a` and `b`.
@@ -184,8 +203,9 @@ pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
/// high 16 bits of the intermediate integers.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
unsafe { pmulhuw(a, b) }
#[cfg_attr(test, assert_instr(pmulhuw))]
pub unsafe fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
pmulhuw(a, b)
}
/// Multiply the packed 16-bit integers in `a` and `b`.
@@ -194,7 +214,8 @@ pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
/// low 16 bits of the intermediate integers.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(pmullw))]
pub unsafe fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
a * b
}
@@ -204,8 +225,9 @@ pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
/// Return the unsigned 64-bit results.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
unsafe { pmuludq(a, b) }
#[cfg_attr(test, assert_instr(pmuludq))]
pub unsafe fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
pmuludq(a, b)
}
/// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -216,35 +238,40 @@ pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
/// the low 16 bits of 64-bit elements returned.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 {
unsafe { psadbw(a, b) }
#[cfg_attr(test, assert_instr(psadbw))]
pub unsafe fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 {
psadbw(a, b)
}
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 {
#[cfg_attr(test, assert_instr(psubb))]
pub unsafe fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 {
a - b
}
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(psubw))]
pub unsafe fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 {
a - b
}
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(psubd))]
pub unsafe fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 {
a - b
}
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
#[cfg_attr(test, assert_instr(psubq))]
pub unsafe fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
a - b
}
@@ -252,54 +279,56 @@ pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
/// using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { psubsb(a, b) }
#[cfg_attr(test, assert_instr(psubsb))]
pub unsafe fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 {
psubsb(a, b)
}
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { psubsw(a, b) }
#[cfg_attr(test, assert_instr(psubsw))]
pub unsafe fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 {
psubsw(a, b)
}
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { psubusb(a, b) }
#[cfg_attr(test, assert_instr(psubusb))]
pub unsafe fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 {
psubusb(a, b)
}
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
unsafe { psubusw(a, b) }
#[cfg_attr(test, assert_instr(psubusw))]
pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
psubusw(a, b)
}
/// Shift `a` left by `imm8` bytes while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
const fn sub(a: u32, b: u32) -> u32 { a - b }
macro_rules! shuffle {
($shift:expr) => {
unsafe {
simd_shuffle16::<__m128i, __m128i>(zero, a, [
sub(16, $shift), sub(17, $shift),
sub(18, $shift), sub(19, $shift),
sub(20, $shift), sub(21, $shift),
sub(22, $shift), sub(23, $shift),
sub(24, $shift), sub(25, $shift),
sub(26, $shift), sub(27, $shift),
sub(28, $shift), sub(29, $shift),
sub(30, $shift), sub(31, $shift),
])
}
simd_shuffle16::<__m128i, __m128i>(zero, a, [
sub(16, $shift), sub(17, $shift),
sub(18, $shift), sub(19, $shift),
sub(20, $shift), sub(21, $shift),
sub(22, $shift), sub(23, $shift),
sub(24, $shift), sub(25, $shift),
sub(26, $shift), sub(27, $shift),
sub(28, $shift), sub(29, $shift),
sub(30, $shift), sub(31, $shift),
])
}
}
match imm8 {
@@ -315,117 +344,146 @@ macro_rules! shuffle {
}
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pslldq))]
fn _test_mm_slli_si128(a: __m128i) -> __m128i {
unsafe { _mm_slli_si128(a, 1) }
}
/// Shift `a` left by `imm8` bytes while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
_mm_slli_si128(a, imm8)
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pslldq))]
fn _test_mm_bslli_si128(a: __m128i) -> __m128i {
unsafe { _mm_bslli_si128(a, 1) }
}
/// Shift `a` right by `imm8` bytes while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
_mm_srli_si128(a, imm8)
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(psrldq))]
fn _test_mm_bsrli_si128(a: __m128i) -> __m128i {
unsafe { _mm_bsrli_si128(a, 1) }
}
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
unsafe { pslliw(a, imm8) }
#[cfg_attr(test, assert_instr(psllw))]
pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
pslliw(a, imm8)
}
/// Shift packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 {
unsafe { psllw(a, count) }
#[cfg_attr(test, assert_instr(psllw))]
pub unsafe fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 {
psllw(a, count)
}
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 {
unsafe { psllid(a, imm8) }
#[cfg_attr(test, assert_instr(pslld))]
pub unsafe fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 {
psllid(a, imm8)
}
/// Shift packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { pslld(a, count) }
#[cfg_attr(test, assert_instr(pslld))]
pub unsafe fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 {
pslld(a, count)
}
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 {
unsafe { pslliq(a, imm8) }
#[cfg_attr(test, assert_instr(psllq))]
pub unsafe fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 {
pslliq(a, imm8)
}
/// Shift packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 {
unsafe { psllq(a, count) }
#[cfg_attr(test, assert_instr(psllq))]
pub unsafe fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 {
psllq(a, count)
}
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign
/// bits.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 {
unsafe { psraiw(a, imm8) }
#[cfg_attr(test, assert_instr(psraw))]
pub unsafe fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 {
psraiw(a, imm8)
}
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 {
unsafe { psraw(a, count) }
#[cfg_attr(test, assert_instr(psraw))]
pub unsafe fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 {
psraw(a, count)
}
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign
/// bits.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 {
unsafe { psraid(a, imm8) }
#[cfg_attr(test, assert_instr(psrad))]
pub unsafe fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 {
psraid(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { psrad(a, count) }
#[cfg_attr(test, assert_instr(psrad))]
pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
psrad(a, count)
}
/// Shift `a` right by `imm8` bytes while shifting in zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
const fn add(a: u32, b: u32) -> u32 { a + b }
macro_rules! shuffle {
($shift:expr) => {
unsafe {
simd_shuffle16::<__m128i, __m128i>(a, zero, [
add(0, $shift), add(1, $shift),
add(2, $shift), add(3, $shift),
add(4, $shift), add(5, $shift),
add(6, $shift), add(7, $shift),
add(8, $shift), add(9, $shift),
add(10, $shift), add(11, $shift),
add(12, $shift), add(13, $shift),
add(14, $shift), add(15, $shift),
])
}
simd_shuffle16::<__m128i, __m128i>(a, zero, [
add(0, $shift), add(1, $shift),
add(2, $shift), add(3, $shift),
add(4, $shift), add(5, $shift),
add(6, $shift), add(7, $shift),
add(8, $shift), add(9, $shift),
add(10, $shift), add(11, $shift),
add(12, $shift), add(13, $shift),
add(14, $shift), add(15, $shift),
])
}
}
match imm8 {
@@ -441,59 +499,73 @@ macro_rules! shuffle {
}
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(psrldq))]
fn _test_mm_srli_si128(a: __m128i) -> __m128i {
unsafe { _mm_srli_si128(a, 1) }
}
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
unsafe { psrliw(a, imm8) }
#[cfg_attr(test, assert_instr(psrlw))]
pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
psrliw(a, imm8)
}
/// Shift packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 {
unsafe { psrlw(a, count) }
#[cfg_attr(test, assert_instr(psrlw))]
pub unsafe fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 {
psrlw(a, count)
}
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 {
unsafe { psrlid(a, imm8) }
#[cfg_attr(test, assert_instr(psrld))]
pub unsafe fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 {
psrlid(a, imm8)
}
/// Shift packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 {
unsafe { psrld(a, count) }
#[cfg_attr(test, assert_instr(psrld))]
pub unsafe fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 {
psrld(a, count)
}
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 {
unsafe { psrliq(a, imm8) }
#[cfg_attr(test, assert_instr(psrlq))]
pub unsafe fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 {
psrliq(a, imm8)
}
/// Shift packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 {
unsafe { psrlq(a, count) }
#[cfg_attr(test, assert_instr(psrlq))]
pub unsafe fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 {
psrlq(a, count)
}
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(andps))]
pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
a & b
}
@@ -501,7 +573,8 @@ pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
/// then AND with `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(andnps))]
pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
(!a) & b
}
@@ -509,7 +582,8 @@ pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
/// `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(orps))]
pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
a | b
}
@@ -517,70 +591,80 @@ pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
/// `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(xorps))]
pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
a ^ b
}
/// Compare packed 8-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
#[cfg_attr(test, assert_instr(pcmpeqb))]
pub unsafe fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
a.eq(b)
}
/// Compare packed 16-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(pcmpeqw))]
pub unsafe fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
a.eq(b)
}
/// Compare packed 32-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(pcmpeqd))]
pub unsafe fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
a.eq(b)
}
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
#[cfg_attr(test, assert_instr(pcmpgtb))]
pub unsafe fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
a.gt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(pcmpgtw))]
pub unsafe fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
a.gt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(pcmpgtd))]
pub unsafe fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
a.gt(b)
}
/// Compare packed 8-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
#[cfg_attr(test, assert_instr(pcmpgtb))]
pub unsafe fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
a.lt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
#[cfg_attr(test, assert_instr(pcmpgtw))]
pub unsafe fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
a.lt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
#[cfg_attr(test, assert_instr(pcmpgtd))]
pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
a.lt(b)
}
@@ -588,31 +672,37 @@ pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
/// double-precision (64-bit) floating-point elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
unsafe { simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1])) }
#[cfg_attr(test, assert_instr(cvtdq2pd))]
pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1]))
}
/// Return `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
#[cfg_attr(test, assert_instr(cvtsi2sd))]
pub unsafe fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
#[cfg_attr(test, assert_instr(cvtsi2sd))]
pub unsafe fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
#[cfg_attr(test, assert_instr(cvtsi2sd))]
pub unsafe fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
_mm_cvtsi64_sd(a, b)
}
@@ -620,52 +710,63 @@ pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
/// floating-point elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 {
unsafe { cvtdq2ps(a) }
#[cfg_attr(test, assert_instr(cvtdq2ps))]
pub unsafe fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 {
cvtdq2ps(a)
}
/// Return a vector whose lowest element is `a` and all higher elements are
/// `0`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi32_si128(a: i32) -> i32x4 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi32_si128(a: i32) -> i32x4 {
i32x4::new(a, 0, 0, 0)
}
/// Return a vector whose lowest element is `a` and all higher elements are
/// `0`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
i64x2::new(a, 0)
}
/// Return a vector whose lowest element is `a` and all higher elements are
/// `0`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
_mm_cvtsi64_si128(a)
}
/// Return the lowest element of `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi128_si32(a: i32x4) -> i32 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi128_si32(a: i32x4) -> i32 {
a.extract(0)
}
/// Return the lowest element of `a`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
a.extract(0)
}
/// Return the lowest element of `a`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
// no particular instruction to test
pub unsafe fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
_mm_cvtsi128_si64(a)
}
@@ -673,21 +774,24 @@ pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
/// lowest.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 {
// no particular instruction to test
pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 {
i64x2::new(e0, e1)
}
/// Set packed 32-bit integers with the supplied values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
// no particular instruction to test
pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
i32x4::new(e0, e1, e2, e3)
}
/// Set packed 16-bit integers with the supplied values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set_epi16(
// no particular instruction to test
pub unsafe fn _mm_set_epi16(
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> i16x8 {
i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
@@ -696,7 +800,8 @@ pub fn _mm_set_epi16(
/// Set packed 8-bit integers with the supplied values.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set_epi8(
// no particular instruction to test
pub unsafe fn _mm_set_epi8(
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> i8x16 {
@@ -708,42 +813,48 @@ pub fn _mm_set_epi8(
/// Broadcast 64-bit integer `a` to all elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set1_epi64x(a: i64) -> i64x2 {
// no particular instruction to test
pub unsafe fn _mm_set1_epi64x(a: i64) -> i64x2 {
i64x2::splat(a)
}
/// Broadcast 32-bit integer `a` to all elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set1_epi32(a: i32) -> i32x4 {
// no particular instruction to test
pub unsafe fn _mm_set1_epi32(a: i32) -> i32x4 {
i32x4::splat(a)
}
/// Broadcast 16-bit integer `a` to all elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set1_epi16(a: i16) -> i16x8 {
// no particular instruction to test
pub unsafe fn _mm_set1_epi16(a: i16) -> i16x8 {
i16x8::splat(a)
}
/// Broadcast 8-bit integer `a` to all elements.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_set1_epi8(a: i8) -> i8x16 {
// no particular instruction to test
pub unsafe fn _mm_set1_epi8(a: i8) -> i8x16 {
i8x16::splat(a)
}
/// Set packed 32-bit integers with the supplied values in reverse order.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
// no particular instruction to test
pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
i32x4::new(e3, e2, e1, e0)
}
/// Set packed 16-bit integers with the supplied values in reverse order.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_setr_epi16(
// no particular instruction to test
pub unsafe fn _mm_setr_epi16(
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> i16x8 {
i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
@@ -752,7 +863,8 @@ pub fn _mm_setr_epi16(
/// Set packed 8-bit integers with the supplied values in reverse order.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_setr_epi8(
// no particular instruction to test
pub unsafe fn _mm_setr_epi8(
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> i8x16 {
@@ -764,13 +876,15 @@ pub fn _mm_setr_epi8(
/// Returns a vector with all elements set to zero.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_setzero_si128() -> __m128i {
#[cfg_attr(test, assert_instr(xorps))]
pub unsafe fn _mm_setzero_si128() -> __m128i {
__m128i::splat(0)
}
/// Load 64-bit integer from memory into first element of returned vector.
#[inline(always)]
#[target_feature = "+sse2"]
// no particular instruction to test
pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 {
i64x2::new((*mem_addr).extract(0), 0)
}
@@ -780,6 +894,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 {
/// `mem_addr` must be aligned on a 16-byte boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
*mem_addr
}
@@ -789,6 +904,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
/// `mem_addr` does not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movups))]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
let mut dst = mem::uninitialized();
ptr::copy_nonoverlapping(
@@ -808,6 +924,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
/// to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(maskmovdqu))]
pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) {
maskmovdqu(a, mask, mem_addr)
}
@@ -817,6 +934,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) {
/// `mem_addr` must be aligned on a 16-byte boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
*mem_addr = a;
}
@@ -826,6 +944,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
/// `mem_addr` does not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(movups))]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
ptr::copy_nonoverlapping(
&a as *const _ as *const u8,
@@ -838,6 +957,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
/// `mem_addr` does not need to be aligned on any particular boundary.
#[inline(always)]
#[target_feature = "+sse2"]
// no particular instruction to test
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
ptr::copy_nonoverlapping(
&a as *const _ as *const u8, mem_addr as *mut u8, 8);
@@ -847,59 +967,78 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
/// element is zero.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_move_epi64(a: i64x2) -> i64x2 {
a.replace(1, 0)
// no particular instruction to test
pub unsafe fn _mm_move_epi64(a: i64x2) -> i64x2 {
simd_shuffle2(a, i64x2::splat(0), [0, 2])
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 {
unsafe { packsswb(a, b) }
#[cfg_attr(test, assert_instr(packsswb))]
pub unsafe fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 {
packsswb(a, b)
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 {
unsafe { packssdw(a, b) }
#[cfg_attr(test, assert_instr(packssdw))]
pub unsafe fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 {
packssdw(a, b)
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 {
unsafe { packuswb(a, b) }
#[cfg_attr(test, assert_instr(packuswb))]
pub unsafe fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 {
packuswb(a, b)
}
/// Return the `imm8` element of `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
pub unsafe fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
a.extract(imm8 as u32 & 0b111) as i32
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pextrw))]
fn _test_mm_extract_epi16(a: i16x8) -> i32 {
unsafe { _mm_extract_epi16(a, 9) }
}
/// Return a new vector where the `imm8` element of `a` is replaced with `i`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
pub unsafe fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
a.replace(imm8 as u32 & 0b111, i as i16)
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pinsrw))]
fn _test_mm_insert_epi16(a: i16x8, i: i32) -> i16x8 {
unsafe { _mm_insert_epi16(a, i, 9) }
}
/// Return a mask of the most significant bit of each element in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_movemask_epi8(a: i8x16) -> i32 {
unsafe { pmovmskb(a) }
#[cfg_attr(test, assert_instr(pmovmskb))]
pub unsafe fn _mm_movemask_epi8(a: i8x16) -> i32 {
pmovmskb(a)
}
/// Shuffle 32-bit integers in `a` using the control in `imm8`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
// simd_shuffleX requires that its selector parameter be made up of
// constant values, but we can't enforce that here. In spirit, we need
// to write a `match` on all possible values of a byte, and for each value,
@@ -911,9 +1050,7 @@ pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
unsafe {
simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
}
simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
}
}
macro_rules! shuffle_x67 {
@@ -954,6 +1091,13 @@ macro_rules! shuffle_x23 {
}
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pshufd))]
fn _test_mm_shuffle_epi32(a: i32x4) -> i32x4 {
unsafe { _mm_shuffle_epi32(a, 9) }
}
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in
/// `imm8`.
///
@@ -961,18 +1105,16 @@ macro_rules! shuffle_x23 {
/// bits being copied from `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
// See _mm_shuffle_epi32.
let imm8 = (imm8 & 0xFF) as u8;
const fn add4(x: u32) -> u32 { x + 4 }
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
unsafe {
simd_shuffle8(a, a, [
0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67),
])
}
simd_shuffle8(a, a, [
0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67),
])
}
}
macro_rules! shuffle_x67 {
@@ -1013,6 +1155,13 @@ macro_rules! shuffle_x23 {
}
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pshufhw))]
fn _test_mm_shufflehi_epi16(a: i16x8) -> i16x8 {
unsafe { _mm_shufflehi_epi16(a, 9) }
}
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in
/// `imm8`.
///
@@ -1020,15 +1169,13 @@ macro_rules! shuffle_x23 {
/// bits being copied from `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
// See _mm_shuffle_epi32.
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
unsafe {
simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
}
simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
}
}
macro_rules! shuffle_x67 {
@@ -1069,77 +1216,89 @@ macro_rules! shuffle_x23 {
}
}
#[cfg(test)]
#[target_feature = "+sse2"]
#[cfg_attr(test, assert_instr(pshuflw))]
fn _test_mm_shufflelo_epi16(a: i16x8) -> i16x8 {
unsafe { _mm_shufflelo_epi16(a, 9) }
}
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe {
simd_shuffle16(a, b, [
8, 24, 9, 25, 10, 26, 11, 27,
12, 28, 13, 29, 14, 30, 15, 31,
])
}
#[cfg_attr(test, assert_instr(punpckhbw))]
pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
simd_shuffle16(a, b, [
8, 24, 9, 25, 10, 26, 11, 27,
12, 28, 13, 29, 14, 30, 15, 31,
])
}
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) }
#[cfg_attr(test, assert_instr(punpckhwd))]
pub unsafe fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
}
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
#[cfg_attr(test, assert_instr(punpckhdq))]
pub unsafe fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 {
simd_shuffle4(a, b, [2, 6, 3, 7])
}
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
unsafe { simd_shuffle2(a, b, [1, 3]) }
#[cfg_attr(test, assert_instr(punpckhqdq))]
pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
simd_shuffle2(a, b, [1, 3])
}
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe {
simd_shuffle16(a, b, [
0, 16, 1, 17, 2, 18, 3, 19,
4, 20, 5, 21, 6, 22, 7, 23,
])
}
#[cfg_attr(test, assert_instr(punpcklbw))]
pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
simd_shuffle16(a, b, [
0, 16, 1, 17, 2, 18, 3, 19,
4, 20, 5, 21, 6, 22, 7, 23,
])
}
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) }
#[cfg_attr(test, assert_instr(punpcklwd))]
pub unsafe fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 {
simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
}
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
#[cfg_attr(test, assert_instr(punpckldq))]
pub unsafe fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 {
simd_shuffle4(a, b, [0, 4, 1, 5])
}
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
unsafe { simd_shuffle2(a, b, [0, 2]) }
#[cfg_attr(test, assert_instr(punpcklqdq))]
pub unsafe fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
simd_shuffle2(a, b, [0, 2])
}
/// Return a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(addsd))]
pub unsafe fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, a.extract(0) + b.extract(0))
}
@@ -1147,7 +1306,8 @@ pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
/// `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(addpd))]
pub unsafe fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
a + b
}
@@ -1155,7 +1315,8 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
/// dividing the lower element of `a` by the lower element of `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(divsd))]
pub unsafe fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, a.extract(0) / b.extract(0))
}
@@ -1163,7 +1324,8 @@ pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
/// packed elements in `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(divpd))]
pub unsafe fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
a / b
}
@@ -1171,39 +1333,44 @@ pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
/// of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { maxsd(a, b) }
#[cfg_attr(test, assert_instr(maxsd))]
pub unsafe fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 {
maxsd(a, b)
}
/// Return a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { maxpd(a, b) }
#[cfg_attr(test, assert_instr(maxpd))]
pub unsafe fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 {
maxpd(a, b)
}
/// Return a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { minsd(a, b) }
#[cfg_attr(test, assert_instr(minsd))]
pub unsafe fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 {
minsd(a, b)
}
/// Return a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { minpd(a, b) }
#[cfg_attr(test, assert_instr(minpd))]
pub unsafe fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
minpd(a, b)
}
/// Return a new vector with the low element of `a` replaced by multiplying the
/// low elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(mulsd))]
pub unsafe fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, a.extract(0) * b.extract(0))
}
@@ -1211,7 +1378,8 @@ pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
/// and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(mulpd))]
pub unsafe fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
a * b
}
@@ -1219,22 +1387,25 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
/// root of the lower element of `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, unsafe { sqrtsd(b).extract(0) })
#[cfg_attr(test, assert_instr(sqrtsd))]
pub unsafe fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, sqrtsd(b).extract(0))
}
/// Return a new vector with the square root of each of the values in `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
unsafe { sqrtpd(a) }
#[cfg_attr(test, assert_instr(sqrtpd))]
pub unsafe fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
sqrtpd(a)
}
/// Return a new vector with the low element of `a` replaced by subtracting the
/// low element of `b` from the low element of `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(subsd))]
pub unsafe fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
a.replace(0, a.extract(0) - b.extract(0))
}
@@ -1242,7 +1413,8 @@ pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
/// from `a`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(subpd))]
pub unsafe fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
a - b
}
@@ -1250,76 +1422,76 @@ pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
/// elements in `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a & b)
}
#[cfg_attr(test, assert_instr(andps))]
pub unsafe fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a & b)
}
/// Compute the bitwise NOT of `a` and then AND with `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute((!a) & b)
}
#[cfg_attr(test, assert_instr(andnps))]
pub unsafe fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute((!a) & b)
}
/// Compute the bitwise OR of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a | b)
}
#[cfg_attr(test, assert_instr(orps))]
pub unsafe fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a | b)
}
/// Compute the bitwise XOR of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a ^ b)
}
#[cfg_attr(test, assert_instr(xorps))]
pub unsafe fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 {
let a: i64x2 = mem::transmute(a);
let b: i64x2 = mem::transmute(b);
mem::transmute(a ^ b)
}
/// Return a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 0) }
#[cfg_attr(test, assert_instr(cmpeqsd))]
pub unsafe fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 0)
}
/// Return a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 1) }
#[cfg_attr(test, assert_instr(cmpltsd))]
pub unsafe fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 1)
}
/// Return a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 2) }
#[cfg_attr(test, assert_instr(cmplesd))]
pub unsafe fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 2)
}
/// Return a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpltsd))]
pub unsafe fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmplt_sd(b, a).replace(1, a.extract(1))
}
@@ -1327,7 +1499,8 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmplesd))]
pub unsafe fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmple_sd(b, a).replace(1, a.extract(1))
}
@@ -1337,8 +1510,9 @@ pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
/// otherwise.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 7) }
#[cfg_attr(test, assert_instr(cmpordsd))]
pub unsafe fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 7)
}
/// Return a new vector with the low element of `a` replaced by the result of
@@ -1346,39 +1520,44 @@ pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 3) }
#[cfg_attr(test, assert_instr(cmpunordsd))]
pub unsafe fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 3)
}
/// Return a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 4) }
#[cfg_attr(test, assert_instr(cmpneqsd))]
pub unsafe fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 4)
}
/// Return a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 5) }
#[cfg_attr(test, assert_instr(cmpnltsd))]
pub unsafe fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 5)
}
/// Return a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmpsd(a, b, 6) }
#[cfg_attr(test, assert_instr(cmpnlesd))]
pub unsafe fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
cmpsd(a, b, 6)
}
/// Return a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpnltsd))]
pub unsafe fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnlt_sd(b, a).replace(1, a.extract(1))
}
@@ -1386,84 +1565,96 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpnlesd))]
pub unsafe fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnle_sd(b, a).replace(1, a.extract(1))
}
/// Compare corresponding elements in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 0) }
#[cfg_attr(test, assert_instr(cmpeqpd))]
pub unsafe fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 0)
}
/// Compare corresponding elements in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 1) }
#[cfg_attr(test, assert_instr(cmpltpd))]
pub unsafe fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 1)
}
/// Compare corresponding elements in `a` and `b` for less-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 2) }
#[cfg_attr(test, assert_instr(cmplepd))]
pub unsafe fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 2)
}
/// Compare corresponding elements in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpltpd))]
pub unsafe fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmplt_pd(b, a)
}
/// Compare corresponding elements in `a` and `b` for greater-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmplepd))]
pub unsafe fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmple_pd(b, a)
}
/// Compare corresponding elements in `a` and `b` to see if neither is `NaN`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 7) }
#[cfg_attr(test, assert_instr(cmpordpd))]
pub unsafe fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 7)
}
/// Compare corresponding elements in `a` and `b` to see if either is `NaN`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 3) }
#[cfg_attr(test, assert_instr(cmpunordpd))]
pub unsafe fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 3)
}
/// Compare corresponding elements in `a` and `b` for not-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 4) }
#[cfg_attr(test, assert_instr(cmpneqpd))]
pub unsafe fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 4)
}
/// Compare corresponding elements in `a` and `b` for not-less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 5) }
#[cfg_attr(test, assert_instr(cmpnltpd))]
pub unsafe fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 5)
}
/// Compare corresponding elements in `a` and `b` for not-less-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { cmppd(a, b, 6) }
#[cfg_attr(test, assert_instr(cmpnlepd))]
pub unsafe fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 {
cmppd(a, b, 6)
}
/// Compare corresponding elements in `a` and `b` for not-greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpnltpd))]
pub unsafe fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnlt_pd(b, a)
}
@@ -1471,92 +1662,105 @@ pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
/// not-greater-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 {
#[cfg_attr(test, assert_instr(cmpnlepd))]
pub unsafe fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnle_pd(b, a)
}
/// Compare the lower element of `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comieqsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comieqsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comiltsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comiltsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for less-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comilesd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comilesd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comigtsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comigtsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for greater-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comigesd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comigesd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for not-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(comineqsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(comisd))]
pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(comineqsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomieqsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomieqsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomiltsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomiltsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for less-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomilesd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomilesd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomigtsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomigtsd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for greater-than-or-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomigesd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomigesd(a, b) as u8)
}
/// Compare the lower element of `a` and `b` for not-equal.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomineqsd(a, b) as u8) }
#[cfg_attr(test, assert_instr(ucomisd))]
pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
mem::transmute(ucomineqsd(a, b) as u8)
}
/// Return a mask of the most significant bit of each element in `a`.
@@ -1565,8 +1769,9 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_movemask_pd(a: f64x2) -> i32 {
unsafe { movmskpd(a) }
#[cfg_attr(test, assert_instr(movmskpd))]
pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
movmskpd(a)
}
@@ -1574,12 +1779,14 @@ pub fn _mm_movemask_pd(a: f64x2) -> i32 {
/// Load two `f64` values from memory at `mem_addr` into an `f64x2`.
///
/// The pointer is reinterpreted as `*const f64x2` and dereferenced, so the
/// whole vector is read through that single pointer.
///
/// # Safety
///
/// `mem_addr` must be valid for reading a full `f64x2` and must satisfy the
/// alignment of `f64x2` (presumably 16 bytes, in line with the aligned
/// `movaps` asserted below; confirm against the `f64x2` definition).
#[inline(always)]
#[target_feature = "+sse2"]
// Test builds assert that this function is implemented with `movaps`.
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
*(mem_addr as *const f64x2)
}
/// Store the two `f64` values of `a` into memory at `mem_addr`.
///
/// The pointer is reinterpreted as `*mut f64x2` and the whole vector is
/// written through it.
///
/// # Safety
///
/// `mem_addr` must be valid for writing a full `f64x2` and must satisfy the
/// alignment of `f64x2` (presumably 16 bytes, in line with the aligned
/// `movaps` asserted below; confirm against the `f64x2` definition).
#[inline(always)]
#[target_feature = "+sse2"]
// Test builds assert that this function is implemented with `movaps`.
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) {
*(mem_addr as *mut f64x2) = a;
}
@@ -1730,7 +1937,7 @@ mod tests {
#[simd_test = "sse2"]
fn _mm_pause() {
sse2::_mm_pause();
unsafe { sse2::_mm_pause() };
}
#[simd_test = "sse2"]
@@ -1741,12 +1948,12 @@ fn _mm_clflush() {
#[simd_test = "sse2"]
fn _mm_lfence() {
sse2::_mm_lfence();
unsafe { sse2::_mm_lfence() };
}
#[simd_test = "sse2"]
fn _mm_mfence() {
sse2::_mm_mfence();
unsafe { sse2::_mm_mfence() };
}
#[simd_test = "sse2"]
@@ -1755,7 +1962,7 @@ fn _mm_add_epi8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = sse2::_mm_add_epi8(a, b);
let r = unsafe { sse2::_mm_add_epi8(a, b) };
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(r, e);
@@ -1765,7 +1972,7 @@ fn _mm_add_epi8() {
fn _mm_add_epi8_overflow() {
let a = i8x16::splat(0x7F);
let b = i8x16::splat(1);
let r = sse2::_mm_add_epi8(a, b);
let r = unsafe { sse2::_mm_add_epi8(a, b) };
assert_eq!(r, i8x16::splat(-128));
}
@@ -1773,7 +1980,7 @@ fn _mm_add_epi8_overflow() {
fn _mm_add_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = sse2::_mm_add_epi16(a, b);
let r = unsafe { sse2::_mm_add_epi16(a, b) };
let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(r, e);
}
@@ -1782,7 +1989,7 @@ fn _mm_add_epi16() {
fn _mm_add_epi32() {
let a = i32x4::new(0, 1, 2, 3);
let b = i32x4::new(4, 5, 6, 7);
let r = sse2::_mm_add_epi32(a, b);
let r = unsafe { sse2::_mm_add_epi32(a, b) };
let e = i32x4::new(4, 6, 8, 10);
assert_eq!(r, e);
}
@@ -1791,7 +1998,7 @@ fn _mm_add_epi32() {
fn _mm_add_epi64() {
let a = i64x2::new(0, 1);
let b = i64x2::new(2, 3);
let r = sse2::_mm_add_epi64(a, b);
let r = unsafe { sse2::_mm_add_epi64(a, b) };
let e = i64x2::new(2, 4);
assert_eq!(r, e);
}
@@ -1802,7 +2009,7 @@ fn _mm_adds_epi8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = sse2::_mm_adds_epi8(a, b);
let r = unsafe { sse2::_mm_adds_epi8(a, b) };
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(r, e);
@@ -1812,7 +2019,7 @@ fn _mm_adds_epi8() {
fn _mm_adds_epi8_saturate_positive() {
let a = i8x16::splat(0x7F);
let b = i8x16::splat(1);
let r = sse2::_mm_adds_epi8(a, b);
let r = unsafe { sse2::_mm_adds_epi8(a, b) };
assert_eq!(r, a);
}
@@ -1820,7 +2027,7 @@ fn _mm_adds_epi8_saturate_positive() {
fn _mm_adds_epi8_saturate_negative() {
let a = i8x16::splat(-0x80);
let b = i8x16::splat(-1);
let r = sse2::_mm_adds_epi8(a, b);
let r = unsafe { sse2::_mm_adds_epi8(a, b) };
assert_eq!(r, a);
}
@@ -1828,7 +2035,7 @@ fn _mm_adds_epi8_saturate_negative() {
fn _mm_adds_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = sse2::_mm_adds_epi16(a, b);
let r = unsafe { sse2::_mm_adds_epi16(a, b) };
let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(r, e);
}
@@ -1837,7 +2044,7 @@ fn _mm_adds_epi16() {
fn _mm_adds_epi16_saturate_positive() {
let a = i16x8::splat(0x7FFF);
let b = i16x8::splat(1);
let r = sse2::_mm_adds_epi16(a, b);
let r = unsafe { sse2::_mm_adds_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1845,7 +2052,7 @@ fn _mm_adds_epi16_saturate_positive() {
fn _mm_adds_epi16_saturate_negative() {
let a = i16x8::splat(-0x8000);
let b = i16x8::splat(-1);
let r = sse2::_mm_adds_epi16(a, b);
let r = unsafe { sse2::_mm_adds_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1855,7 +2062,7 @@ fn _mm_adds_epu8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = sse2::_mm_adds_epu8(a, b);
let r = unsafe { sse2::_mm_adds_epu8(a, b) };
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(r, e);
@@ -1865,7 +2072,7 @@ fn _mm_adds_epu8() {
fn _mm_adds_epu8_saturate() {
let a = u8x16::splat(0xFF);
let b = u8x16::splat(1);
let r = sse2::_mm_adds_epu8(a, b);
let r = unsafe { sse2::_mm_adds_epu8(a, b) };
assert_eq!(r, a);
}
@@ -1873,7 +2080,7 @@ fn _mm_adds_epu8_saturate() {
fn _mm_adds_epu16() {
let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = sse2::_mm_adds_epu16(a, b);
let r = unsafe { sse2::_mm_adds_epu16(a, b) };
let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(r, e);
}
@@ -1882,21 +2089,21 @@ fn _mm_adds_epu16() {
fn _mm_adds_epu16_saturate() {
let a = u16x8::splat(0xFFFF);
let b = u16x8::splat(1);
let r = sse2::_mm_adds_epu16(a, b);
let r = unsafe { sse2::_mm_adds_epu16(a, b) };
assert_eq!(r, a);
}
#[simd_test = "sse2"]
fn _mm_avg_epu8() {
let (a, b) = (u8x16::splat(3), u8x16::splat(9));
let r = sse2::_mm_avg_epu8(a, b);
let r = unsafe { sse2::_mm_avg_epu8(a, b) };
assert_eq!(r, u8x16::splat(6));
}
#[simd_test = "sse2"]
fn _mm_avg_epu16() {
let (a, b) = (u16x8::splat(3), u16x8::splat(9));
let r = sse2::_mm_avg_epu16(a, b);
let r = unsafe { sse2::_mm_avg_epu16(a, b) };
assert_eq!(r, u16x8::splat(6));
}
@@ -1904,7 +2111,7 @@ fn _mm_avg_epu16() {
fn _mm_madd_epi16() {
let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_madd_epi16(a, b);
let r = unsafe { sse2::_mm_madd_epi16(a, b) };
let e = i32x4::new(29, 81, 149, 233);
assert_eq!(r, e);
}
@@ -1913,7 +2120,7 @@ fn _mm_madd_epi16() {
fn _mm_max_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = sse2::_mm_max_epi16(a, b);
let r = unsafe { sse2::_mm_max_epi16(a, b) };
assert_eq!(r, a);
}
@@ -1921,7 +2128,7 @@ fn _mm_max_epi16() {
fn _mm_max_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = sse2::_mm_max_epu8(a, b);
let r = unsafe { sse2::_mm_max_epu8(a, b) };
assert_eq!(r, b);
}
@@ -1929,7 +2136,7 @@ fn _mm_max_epu8() {
fn _mm_min_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = sse2::_mm_min_epi16(a, b);
let r = unsafe { sse2::_mm_min_epi16(a, b) };
assert_eq!(r, b);
}
@@ -1937,28 +2144,28 @@ fn _mm_min_epi16() {
fn _mm_min_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = sse2::_mm_min_epu8(a, b);
let r = unsafe { sse2::_mm_min_epu8(a, b) };
assert_eq!(r, a);
}
#[simd_test = "sse2"]
fn _mm_mulhi_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = sse2::_mm_mulhi_epi16(a, b);
let r = unsafe { sse2::_mm_mulhi_epi16(a, b) };
assert_eq!(r, i16x8::splat(-16));
}
#[simd_test = "sse2"]
fn _mm_mulhi_epu16() {
let (a, b) = (u16x8::splat(1000), u16x8::splat(1001));
let r = sse2::_mm_mulhi_epu16(a, b);
let r = unsafe { sse2::_mm_mulhi_epu16(a, b) };
assert_eq!(r, u16x8::splat(15));
}
#[simd_test = "sse2"]
fn _mm_mullo_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = sse2::_mm_mullo_epi16(a, b);
let r = unsafe { sse2::_mm_mullo_epi16(a, b) };
assert_eq!(r, i16x8::splat(-17960));
}
@@ -1966,7 +2173,7 @@ fn _mm_mullo_epi16() {
fn _mm_mul_epu32() {
let a = u32x4::from(u64x2::new(1_000_000_000, 1 << 34));
let b = u32x4::from(u64x2::new(1_000_000_000, 1 << 35));
let r = sse2::_mm_mul_epu32(a, b);
let r = unsafe { sse2::_mm_mul_epu32(a, b) };
let e = u64x2::new(1_000_000_000 * 1_000_000_000, 0);
assert_eq!(r, e);
}
@@ -1979,7 +2186,7 @@ fn _mm_sad_epu8() {
let b = u8x16::new(
0, 0, 0, 0, 2, 1, 2, 1,
1, 1, 1, 1, 1, 2, 1, 2);
let r = sse2::_mm_sad_epu8(a, b);
let r = unsafe { sse2::_mm_sad_epu8(a, b) };
let e = u64x2::new(1020, 614);
assert_eq!(r, e);
}
@@ -1987,35 +2194,35 @@ fn _mm_sad_epu8() {
#[simd_test = "sse2"]
fn _mm_sub_epi8() {
let (a, b) = (i8x16::splat(5), i8x16::splat(6));
let r = sse2::_mm_sub_epi8(a, b);
let r = unsafe { sse2::_mm_sub_epi8(a, b) };
assert_eq!(r, i8x16::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_sub_epi16() {
let (a, b) = (i16x8::splat(5), i16x8::splat(6));
let r = sse2::_mm_sub_epi16(a, b);
let r = unsafe { sse2::_mm_sub_epi16(a, b) };
assert_eq!(r, i16x8::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_sub_epi32() {
let (a, b) = (i32x4::splat(5), i32x4::splat(6));
let r = sse2::_mm_sub_epi32(a, b);
let r = unsafe { sse2::_mm_sub_epi32(a, b) };
assert_eq!(r, i32x4::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_sub_epi64() {
let (a, b) = (i64x2::splat(5), i64x2::splat(6));
let r = sse2::_mm_sub_epi64(a, b);
let r = unsafe { sse2::_mm_sub_epi64(a, b) };
assert_eq!(r, i64x2::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_subs_epi8() {
let (a, b) = (i8x16::splat(5), i8x16::splat(2));
let r = sse2::_mm_subs_epi8(a, b);
let r = unsafe { sse2::_mm_subs_epi8(a, b) };
assert_eq!(r, i8x16::splat(3));
}
@@ -2023,7 +2230,7 @@ fn _mm_subs_epi8() {
fn _mm_subs_epi8_saturate_positive() {
let a = i8x16::splat(0x7F);
let b = i8x16::splat(-1);
let r = sse2::_mm_subs_epi8(a, b);
let r = unsafe { sse2::_mm_subs_epi8(a, b) };
assert_eq!(r, a);
}
@@ -2031,14 +2238,14 @@ fn _mm_subs_epi8_saturate_positive() {
fn _mm_subs_epi8_saturate_negative() {
let a = i8x16::splat(-0x80);
let b = i8x16::splat(1);
let r = sse2::_mm_subs_epi8(a, b);
let r = unsafe { sse2::_mm_subs_epi8(a, b) };
assert_eq!(r, a);
}
#[simd_test = "sse2"]
fn _mm_subs_epi16() {
let (a, b) = (i16x8::splat(5), i16x8::splat(2));
let r = sse2::_mm_subs_epi16(a, b);
let r = unsafe { sse2::_mm_subs_epi16(a, b) };
assert_eq!(r, i16x8::splat(3));
}
@@ -2046,7 +2253,7 @@ fn _mm_subs_epi16() {
fn _mm_subs_epi16_saturate_positive() {
let a = i16x8::splat(0x7FFF);
let b = i16x8::splat(-1);
let r = sse2::_mm_subs_epi16(a, b);
let r = unsafe { sse2::_mm_subs_epi16(a, b) };
assert_eq!(r, a);
}
@@ -2054,14 +2261,14 @@ fn _mm_subs_epi16_saturate_positive() {
fn _mm_subs_epi16_saturate_negative() {
let a = i16x8::splat(-0x8000);
let b = i16x8::splat(1);
let r = sse2::_mm_subs_epi16(a, b);
let r = unsafe { sse2::_mm_subs_epi16(a, b) };
assert_eq!(r, a);
}
#[simd_test = "sse2"]
fn _mm_subs_epu8() {
let (a, b) = (u8x16::splat(5), u8x16::splat(2));
let r = sse2::_mm_subs_epu8(a, b);
let r = unsafe { sse2::_mm_subs_epu8(a, b) };
assert_eq!(r, u8x16::splat(3));
}
@@ -2069,14 +2276,14 @@ fn _mm_subs_epu8() {
fn _mm_subs_epu8_saturate() {
let a = u8x16::splat(0);
let b = u8x16::splat(1);
let r = sse2::_mm_subs_epu8(a, b);
let r = unsafe { sse2::_mm_subs_epu8(a, b) };
assert_eq!(r, a);
}
#[simd_test = "sse2"]
fn _mm_subs_epu16() {
let (a, b) = (u16x8::splat(5), u16x8::splat(2));
let r = sse2::_mm_subs_epu16(a, b);
let r = unsafe { sse2::_mm_subs_epu16(a, b) };
assert_eq!(r, u16x8::splat(3));
}
@@ -2084,7 +2291,7 @@ fn _mm_subs_epu16() {
fn _mm_subs_epu16_saturate() {
let a = u16x8::splat(0);
let b = u16x8::splat(1);
let r = sse2::_mm_subs_epu16(a, b);
let r = unsafe { sse2::_mm_subs_epu16(a, b) };
assert_eq!(r, a);
}
@@ -2092,31 +2299,31 @@ fn _mm_subs_epu16_saturate() {
fn _mm_slli_si128() {
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_slli_si128(a, 1);
let r = unsafe { sse2::_mm_slli_si128(a, 1) };
let e = __m128i::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
assert_eq!(r, e);
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_slli_si128(a, 15);
let r = unsafe { sse2::_mm_slli_si128(a, 15) };
let e = __m128i::new(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
assert_eq!(r, e);
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_slli_si128(a, 16);
let r = unsafe { sse2::_mm_slli_si128(a, 16) };
assert_eq!(r, __m128i::splat(0));
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_slli_si128(a, -1);
let r = unsafe { sse2::_mm_slli_si128(a, -1) };
assert_eq!(r, __m128i::splat(0));
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_slli_si128(a, -0x80000000);
let r = unsafe { sse2::_mm_slli_si128(a, -0x80000000) };
assert_eq!(r, __m128i::splat(0));
}
@@ -2124,7 +2331,7 @@ fn _mm_slli_si128() {
fn _mm_slli_epi16() {
let a = i16x8::new(
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
let r = sse2::_mm_slli_epi16(a, 4);
let r = unsafe { sse2::_mm_slli_epi16(a, 4) };
let e = i16x8::new(
0xFFF0 as u16 as i16,
0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0);
@@ -2134,98 +2341,101 @@ fn _mm_slli_epi16() {
#[simd_test = "sse2"]
fn _mm_sll_epi16() {
let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
let r = sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
let r = unsafe {
sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0))
};
assert_eq!(r, i16x8::new(0xFF0, 0, 0, 0, 0, 0, 0, 0));
let r = sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0));
let r = unsafe {
sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0))
};
assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
}
#[simd_test = "sse2"]
fn _mm_slli_epi32() {
assert_eq!(
sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4),
i32x4::splat(0xFFFF0));
let r = unsafe { sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4) };
assert_eq!(r, i32x4::splat(0xFFFF0));
}
#[simd_test = "sse2"]
fn _mm_sll_epi32() {
assert_eq!(
sse2::_mm_sll_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
i32x4::splat(0xFFFF0));
let a = i32x4::splat(0xFFFF);
let b = i32x4::new(4, 0, 0, 0);
let r = unsafe { sse2::_mm_sll_epi32(a, b) };
assert_eq!(r, i32x4::splat(0xFFFF0));
}
#[simd_test = "sse2"]
fn _mm_slli_epi64() {
assert_eq!(
sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4),
i64x2::splat(0xFFFFFFFF0));
let r = unsafe { sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4) };
assert_eq!(r, i64x2::splat(0xFFFFFFFF0));
}
#[simd_test = "sse2"]
fn _mm_sll_epi64() {
assert_eq!(
sse2::_mm_sll_epi64(
i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)),
i64x2::splat(0xFFFFFFFF0));
let a = i64x2::splat(0xFFFFFFFF);
let b = i64x2::new(4, 0);
let r = unsafe { sse2::_mm_sll_epi64(a, b) };
assert_eq!(r, i64x2::splat(0xFFFFFFFF0));
}
#[simd_test = "sse2"]
fn _mm_srai_epi16() {
assert_eq!(
sse2::_mm_srai_epi16(i16x8::splat(-1), 1), i16x8::splat(-1));
let r = unsafe { sse2::_mm_srai_epi16(i16x8::splat(-1), 1) };
assert_eq!(r, i16x8::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_sra_epi16() {
assert_eq!(
sse2::_mm_sra_epi16(
i16x8::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)),
i16x8::splat(-1));
let a = i16x8::splat(-1);
let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0);
let r = unsafe { sse2::_mm_sra_epi16(a, b) };
assert_eq!(r, i16x8::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_srai_epi32() {
assert_eq!(
sse2::_mm_srai_epi32(i32x4::splat(-1), 1), i32x4::splat(-1));
let r = unsafe { sse2::_mm_srai_epi32(i32x4::splat(-1), 1) };
assert_eq!(r, i32x4::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_sra_epi32() {
assert_eq!(
sse2::_mm_sra_epi32(
i32x4::splat(-1), i32x4::new(1, 0, 0, 0)),
i32x4::splat(-1));
let a = i32x4::splat(-1);
let b = i32x4::new(1, 0, 0, 0);
let r = unsafe { sse2::_mm_sra_epi32(a, b) };
assert_eq!(r, i32x4::splat(-1));
}
#[simd_test = "sse2"]
fn _mm_srli_si128() {
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_srli_si128(a, 1);
let r = unsafe { sse2::_mm_srli_si128(a, 1) };
let e = __m128i::new(
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
assert_eq!(r, e);
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_srli_si128(a, 15);
let r = unsafe { sse2::_mm_srli_si128(a, 15) };
let e = __m128i::new(
16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
assert_eq!(r, e);
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_srli_si128(a, 16);
let r = unsafe { sse2::_mm_srli_si128(a, 16) };
assert_eq!(r, __m128i::splat(0));
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_srli_si128(a, -1);
let r = unsafe { sse2::_mm_srli_si128(a, -1) };
assert_eq!(r, __m128i::splat(0));
let a = __m128i::new(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let r = sse2::_mm_srli_si128(a, -0x80000000);
let r = unsafe { sse2::_mm_srli_si128(a, -0x80000000) };
assert_eq!(r, __m128i::splat(0));
}
@@ -2233,7 +2443,7 @@ fn _mm_srli_si128() {
fn _mm_srli_epi16() {
let a = i16x8::new(
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
let r = sse2::_mm_srli_epi16(a, 4);
let r = unsafe { sse2::_mm_srli_epi16(a, 4) };
let e = i16x8::new(
0xFFF as u16 as i16,
0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0);
@@ -2243,67 +2453,74 @@ fn _mm_srli_epi16() {
#[simd_test = "sse2"]
fn _mm_srl_epi16() {
let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
let r = sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
let r = unsafe {
sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0))
};
assert_eq!(r, i16x8::new(0xF, 0, 0, 0, 0, 0, 0, 0));
let r = sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0));
let r = unsafe {
sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0))
};
assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
}
#[simd_test = "sse2"]
fn _mm_srli_epi32() {
assert_eq!(
sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4),
i32x4::splat(0xFFF));
let r = unsafe { sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4) };
assert_eq!(r, i32x4::splat(0xFFF));
}
#[simd_test = "sse2"]
fn _mm_srl_epi32() {
assert_eq!(
sse2::_mm_srl_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
i32x4::splat(0xFFF));
let a = i32x4::splat(0xFFFF);
let b = i32x4::new(4, 0, 0, 0);
let r = unsafe { sse2::_mm_srl_epi32(a, b) };
assert_eq!(r, i32x4::splat(0xFFF));
}
#[simd_test = "sse2"]
fn _mm_srli_epi64() {
assert_eq!(
sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4),
i64x2::splat(0xFFFFFFF));
let r = unsafe { sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4) };
assert_eq!(r, i64x2::splat(0xFFFFFFF));
}
#[simd_test = "sse2"]
fn _mm_srl_epi64() {
assert_eq!(
sse2::_mm_srl_epi64(
i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)),
i64x2::splat(0xFFFFFFF));
let a = i64x2::splat(0xFFFFFFFF);
let b = i64x2::new(4, 0);
let r = unsafe { sse2::_mm_srl_epi64(a, b) };
assert_eq!(r, i64x2::splat(0xFFFFFFF));
}
#[simd_test = "sse2"]
fn _mm_and_si128() {
assert_eq!(
sse2::_mm_and_si128(__m128i::splat(5), __m128i::splat(3)),
__m128i::splat(1));
let a = __m128i::splat(5);
let b = __m128i::splat(3);
let r = unsafe { sse2::_mm_and_si128(a, b) };
assert_eq!(r, __m128i::splat(1));
}
#[simd_test = "sse2"]
fn _mm_andnot_si128() {
assert_eq!(
sse2::_mm_andnot_si128(__m128i::splat(5), __m128i::splat(3)),
__m128i::splat(2));
let a = __m128i::splat(5);
let b = __m128i::splat(3);
let r = unsafe { sse2::_mm_andnot_si128(a, b) };
assert_eq!(r, __m128i::splat(2));
}
#[simd_test = "sse2"]
fn _mm_or_si128() {
assert_eq!(
sse2::_mm_or_si128(__m128i::splat(5), __m128i::splat(3)),
__m128i::splat(7));
let a = __m128i::splat(5);
let b = __m128i::splat(3);
let r = unsafe { sse2::_mm_or_si128(a, b) };
assert_eq!(r, __m128i::splat(7));
}
#[simd_test = "sse2"]
fn _mm_xor_si128() {
assert_eq!(
sse2::_mm_xor_si128(__m128i::splat(5), __m128i::splat(3)),
__m128i::splat(6));
let a = __m128i::splat(5);
let b = __m128i::splat(3);
let r = unsafe { sse2::_mm_xor_si128(a, b) };
assert_eq!(r, __m128i::splat(6));
}
#[simd_test = "sse2"]
@@ -2312,7 +2529,7 @@ fn _mm_cmpeq_epi8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = sse2::_mm_cmpeq_epi8(a, b);
let r = unsafe { sse2::_mm_cmpeq_epi8(a, b) };
assert_eq!(r, i8x16::new(
0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
}
@@ -2321,7 +2538,7 @@ fn _mm_cmpeq_epi8() {
fn _mm_cmpeq_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
let r = sse2::_mm_cmpeq_epi16(a, b);
let r = unsafe { sse2::_mm_cmpeq_epi16(a, b) };
assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
}
@@ -2329,7 +2546,7 @@ fn _mm_cmpeq_epi16() {
fn _mm_cmpeq_epi32() {
let a = i32x4::new(0, 1, 2, 3);
let b = i32x4::new(3, 2, 2, 0);
let r = sse2::_mm_cmpeq_epi32(a, b);
let r = unsafe { sse2::_mm_cmpeq_epi32(a, b) };
assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
}
@@ -2337,7 +2554,7 @@ fn _mm_cmpeq_epi32() {
fn _mm_cmpgt_epi8() {
let a = i8x16::splat(0).replace(0, 5);
let b = i8x16::splat(0);
let r = sse2::_mm_cmpgt_epi8(a, b);
let r = unsafe { sse2::_mm_cmpgt_epi8(a, b) };
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
@@ -2345,7 +2562,7 @@ fn _mm_cmpgt_epi8() {
fn _mm_cmpgt_epi16() {
let a = i16x8::splat(0).replace(0, 5);
let b = i16x8::splat(0);
let r = sse2::_mm_cmpgt_epi16(a, b);
let r = unsafe { sse2::_mm_cmpgt_epi16(a, b) };
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
@@ -2353,7 +2570,7 @@ fn _mm_cmpgt_epi16() {
fn _mm_cmpgt_epi32() {
let a = i32x4::splat(0).replace(0, 5);
let b = i32x4::splat(0);
let r = sse2::_mm_cmpgt_epi32(a, b);
let r = unsafe { sse2::_mm_cmpgt_epi32(a, b) };
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
@@ -2361,7 +2578,7 @@ fn _mm_cmpgt_epi32() {
fn _mm_cmplt_epi8() {
let a = i8x16::splat(0);
let b = i8x16::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi8(a, b);
let r = unsafe { sse2::_mm_cmplt_epi8(a, b) };
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
@@ -2369,7 +2586,7 @@ fn _mm_cmplt_epi8() {
fn _mm_cmplt_epi16() {
let a = i16x8::splat(0);
let b = i16x8::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi16(a, b);
let r = unsafe { sse2::_mm_cmplt_epi16(a, b) };
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
@@ -2377,123 +2594,161 @@ fn _mm_cmplt_epi16() {
fn _mm_cmplt_epi32() {
let a = i32x4::splat(0);
let b = i32x4::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi32(a, b);
let r = unsafe { sse2::_mm_cmplt_epi32(a, b) };
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
#[simd_test = "sse2"]
fn _mm_cvtepi32_pd() {
let a = sse2::_mm_set_epi32(35, 25, 15, 5);
let r = sse2::_mm_cvtepi32_pd(a);
let a = unsafe { sse2::_mm_set_epi32(35, 25, 15, 5) };
let r = unsafe { sse2::_mm_cvtepi32_pd(a) };
assert_eq!(r, f64x2::new(5.0, 15.0));
}
#[simd_test = "sse2"]
fn _mm_cvtsi32_sd() {
let a = f64x2::splat(3.5);
assert_eq!(sse2::_mm_cvtsi32_sd(a, 5), f64x2::new(5.0, 3.5));
let r = unsafe { sse2::_mm_cvtsi32_sd(a, 5) };
assert_eq!(r, f64x2::new(5.0, 3.5));
}
#[cfg(target_arch = "x86_64")]
#[simd_test = "sse2"]
fn _mm_cvtsi64_sd() {
let a = f64x2::splat(3.5);
assert_eq!(sse2::_mm_cvtsi64_sd(a, 5), f64x2::new(5.0, 3.5));
let r = unsafe { sse2::_mm_cvtsi64_sd(a, 5) };
assert_eq!(r, f64x2::new(5.0, 3.5));
}
#[simd_test = "sse2"]
fn _mm_cvtepi32_ps() {
let a = i32x4::new(1, 2, 3, 4);
assert_eq!(sse2::_mm_cvtepi32_ps(a), f32x4::new(1.0, 2.0, 3.0, 4.0));
let r = unsafe { sse2::_mm_cvtepi32_ps(a) };
assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0));
}
#[simd_test = "sse2"]
fn _mm_cvtsi32_si128() {
assert_eq!(sse2::_mm_cvtsi32_si128(5), i32x4::new(5, 0, 0, 0));
let r = unsafe { sse2::_mm_cvtsi32_si128(5) };
assert_eq!(r, i32x4::new(5, 0, 0, 0));
}
#[cfg(target_arch = "x86_64")]
#[simd_test = "sse2"]
fn _mm_cvtsi64_si128() {
assert_eq!(sse2::_mm_cvtsi64_si128(5), i64x2::new(5, 0));
let r = unsafe { sse2::_mm_cvtsi64_si128(5) };
assert_eq!(r, i64x2::new(5, 0));
}
#[simd_test = "sse2"]
fn _mm_cvtsi128_si32() {
assert_eq!(sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)), 5);
let r = unsafe { sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)) };
assert_eq!(r, 5);
}
#[cfg(target_arch = "x86_64")]
#[simd_test = "sse2"]
fn _mm_cvtsi128_si64() {
assert_eq!(sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)), 5);
let r = unsafe { sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)) };
assert_eq!(r, 5);
}
#[simd_test = "sse2"]
fn _mm_set_epi64x() {
assert_eq!(sse2::_mm_set_epi64x(0, 1), i64x2::new(1, 0));
let r = unsafe { sse2::_mm_set_epi64x(0, 1) };
assert_eq!(r, i64x2::new(1, 0));
}
#[simd_test = "sse2"]
fn _mm_set_epi32() {
assert_eq!(sse2::_mm_set_epi32(0, 1, 2, 3), i32x4::new(3, 2, 1, 0));
let r = unsafe { sse2::_mm_set_epi32(0, 1, 2, 3) };
assert_eq!(r, i32x4::new(3, 2, 1, 0));
}
#[simd_test = "sse2"]
fn _mm_set_epi16() {
assert_eq!(
sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7),
i16x8::new(7, 6, 5, 4, 3, 2, 1, 0));
let r = unsafe { sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7) };
assert_eq!(r, i16x8::new(7, 6, 5, 4, 3, 2, 1, 0));
}
#[simd_test = "sse2"]
fn _mm_set_epi8() {
assert_eq!(
let r = unsafe {
sse2::_mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
)
};
let e = i8x16::new(
15, 14, 13, 12,
11, 10, 9, 8,
7, 6, 5, 4,
3, 2, 1, 0,
);
assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_set1_epi64x() {
assert_eq!(sse2::_mm_set1_epi64x(1), i64x2::splat(1));
let r = unsafe { sse2::_mm_set1_epi64x(1) };
assert_eq!(r, i64x2::splat(1));
}
#[simd_test = "sse2"]
fn _mm_set1_epi32() {
assert_eq!(sse2::_mm_set1_epi32(1), i32x4::splat(1));
let r = unsafe { sse2::_mm_set1_epi32(1) };
assert_eq!(r, i32x4::splat(1));
}
#[simd_test = "sse2"]
fn _mm_set1_epi16() {
assert_eq!(sse2::_mm_set1_epi16(1), i16x8::splat(1));
let r = unsafe { sse2::_mm_set1_epi16(1) };
assert_eq!(r, i16x8::splat(1));
}
#[simd_test = "sse2"]
fn _mm_set1_epi8() {
assert_eq!(sse2::_mm_set1_epi8(1), i8x16::splat(1));
let r = unsafe { sse2::_mm_set1_epi8(1) };
assert_eq!(r, i8x16::splat(1));
}
#[simd_test = "sse2"]
fn _mm_setr_epi32() {
assert_eq!(sse2::_mm_setr_epi32(0, 1, 2, 3), i32x4::new(0, 1, 2, 3));
let r = unsafe { sse2::_mm_setr_epi32(0, 1, 2, 3) };
assert_eq!(r, i32x4::new(0, 1, 2, 3));
}
#[simd_test = "sse2"]
fn _mm_setr_epi16() {
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7) };
    // The "r" variant is expected to keep the arguments in the order given.
    assert_eq!(r, i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
}
#[simd_test = "sse2"]
fn _mm_setr_epi8() {
assert_eq!(
let r = unsafe {
sse2::_mm_setr_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
)
};
let e = i8x16::new(
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
);
assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_setzero_si128() {
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_setzero_si128() };
    // The result is compared against an all-zero vector converted to `__m128i`.
    assert_eq!(r, __m128i::from(i64x2::splat(0)));
}
#[simd_test = "sse2"]
@@ -2505,14 +2760,14 @@ fn _mm_loadl_epi64() {
#[simd_test = "sse2"]
fn _mm_load_si128() {
    // Both intrinsics are `unsafe fn`s; each call gets its own `unsafe` block.
    let a = unsafe { sse2::_mm_set_epi64x(5, 6) };
    // Load back through a pointer to the local vector; the result must equal it.
    let r = unsafe { sse2::_mm_load_si128(&a as *const _ as *const _) };
    assert_eq!(a, i64x2::from(r));
}
#[simd_test = "sse2"]
fn _mm_loadu_si128() {
    // Both intrinsics are `unsafe fn`s; each call gets its own `unsafe` block.
    let a = unsafe { sse2::_mm_set_epi64x(5, 6) };
    // Load back through a pointer to the local vector; the result must equal it.
    let r = unsafe { sse2::_mm_loadu_si128(&a as *const _ as *const _) };
    assert_eq!(a, i64x2::from(r));
}
@@ -2561,14 +2816,15 @@ fn _mm_storel_epi64() {
#[simd_test = "sse2"]
fn _mm_move_epi64() {
    let a = i64x2::new(5, 6);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_move_epi64(a) };
    // Lane 0 is kept and lane 1 is expected to be zeroed.
    assert_eq!(r, i64x2::new(5, 0));
}
#[simd_test = "sse2"]
fn _mm_packs_epi16() {
let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
let r = sse2::_mm_packs_epi16(a, b);
let r = unsafe { sse2::_mm_packs_epi16(a, b) };
assert_eq!(r, i8x16::new(
0x7F, -0x80, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -0x80, 0x7F));
@@ -2578,7 +2834,7 @@ fn _mm_packs_epi16() {
fn _mm_packs_epi32() {
    // Inputs sit just outside the i16 range on both sides.
    let a = i32x4::new(0x8000, -0x8001, 0, 0);
    let b = i32x4::new(0, 0, -0x8001, 0x8000);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_packs_epi32(a, b) };
    // Out-of-range values are expected to clamp to 0x7FFF / -0x8000.
    assert_eq!(
        r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
}
@@ -2587,7 +2843,7 @@ fn _mm_packs_epi32() {
fn _mm_packus_epi16() {
let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0);
let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100);
let r = sse2::_mm_packus_epi16(a, b);
let r = unsafe { sse2::_mm_packus_epi16(a, b) };
assert_eq!(r, u8x16::new(
0xFF, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0xFF));
@@ -2596,13 +2852,15 @@ fn _mm_packus_epi16() {
#[simd_test = "sse2"]
fn _mm_extract_epi16() {
    let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_extract_epi16(a, 5) };
    // Lane 5 of `a` holds the value 5.
    assert_eq!(r, 5);
}
#[simd_test = "sse2"]
fn _mm_insert_epi16() {
    let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_insert_epi16(a, 9, 0) };
    // Inserting 9 at index 0 must match replacing lane 0 with 9.
    assert_eq!(r, a.replace(0, 9));
}
#[simd_test = "sse2"]
@@ -2610,28 +2868,32 @@ fn _mm_movemask_epi8() {
let a = i8x16::from(u8x16::new(
0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000));
assert_eq!(sse2::_mm_movemask_epi8(a), 0b10100100_00100101);
let r = unsafe { sse2::_mm_movemask_epi8(a) };
assert_eq!(r, 0b10100100_00100101);
}
#[simd_test = "sse2"]
fn _mm_shuffle_epi32() {
    let a = i32x4::new(5, 10, 15, 20);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_shuffle_epi32(a, 0b00_01_01_11) };
    // Two selector bits per output lane, lowest pair first: 3, 1, 1, 0.
    let e = i32x4::new(20, 10, 10, 5);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_shufflehi_epi16() {
    let a = i16x8::new(1, 2, 3, 4, 5, 10, 15, 20);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_shufflehi_epi16(a, 0b00_01_01_11) };
    // The low four lanes pass through; the high four are permuted (3, 1, 1, 0).
    let e = i16x8::new(1, 2, 3, 4, 20, 10, 10, 5);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_shufflelo_epi16() {
    let a = i16x8::new(5, 10, 15, 20, 1, 2, 3, 4);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11) };
    // The low four lanes are permuted (3, 1, 1, 0); the high four pass through.
    let e = i16x8::new(20, 10, 10, 5, 1, 2, 3, 4);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
@@ -2640,33 +2902,37 @@ fn _mm_unpackhi_epi8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_unpackhi_epi8(a, b) };
let e = i8x16::new(
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
assert_eq!(sse2::_mm_unpackhi_epi8(a, b), e);
assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpackhi_epi16() {
    let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpackhi_epi16(a, b) };
    // The upper halves of `a` and `b` are interleaved, `a` first.
    let e = i16x8::new(4, 12, 5, 13, 6, 14, 7, 15);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpackhi_epi32() {
    let a = i32x4::new(0, 1, 2, 3);
    let b = i32x4::new(4, 5, 6, 7);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpackhi_epi32(a, b) };
    // The upper halves of `a` and `b` are interleaved, `a` first.
    let e = i32x4::new(2, 6, 3, 7);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpackhi_epi64() {
    let a = i64x2::new(0, 1);
    let b = i64x2::new(2, 3);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpackhi_epi64(a, b) };
    // The high lane of `a` followed by the high lane of `b`.
    let e = i64x2::new(1, 3);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
@@ -2675,131 +2941,147 @@ fn _mm_unpacklo_epi8() {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_unpacklo_epi8(a, b) };
let e = i8x16::new(
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
assert_eq!(sse2::_mm_unpacklo_epi8(a, b), e);
assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpacklo_epi16() {
    let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpacklo_epi16(a, b) };
    // The lower halves of `a` and `b` are interleaved, `a` first.
    let e = i16x8::new(0, 8, 1, 9, 2, 10, 3, 11);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpacklo_epi32() {
    let a = i32x4::new(0, 1, 2, 3);
    let b = i32x4::new(4, 5, 6, 7);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpacklo_epi32(a, b) };
    // The lower halves of `a` and `b` are interleaved, `a` first.
    let e = i32x4::new(0, 4, 1, 5);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_unpacklo_epi64() {
    let a = i64x2::new(0, 1);
    let b = i64x2::new(2, 3);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_unpacklo_epi64(a, b) };
    // The low lane of `a` followed by the low lane of `b`.
    let e = i64x2::new(0, 2);
    assert_eq!(r, e);
}
#[simd_test = "sse2"]
fn _mm_add_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_add_sd(a, b) };
    // Only lane 0 is summed; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(6.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_add_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_add_pd(a, b) };
    // Both lanes are summed.
    assert_eq!(r, f64x2::new(6.0, 12.0));
}
#[simd_test = "sse2"]
fn _mm_div_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_div_sd(a, b) };
    // Only lane 0 is divided; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(0.2, 2.0));
}
#[simd_test = "sse2"]
fn _mm_div_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_div_pd(a, b) };
    // Both lanes are divided: 1/5 and 2/10.
    assert_eq!(r, f64x2::new(0.2, 0.2));
}
#[simd_test = "sse2"]
fn _mm_max_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_max_sd(a, b) };
    // Lane 0 is the larger low lane; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(5.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_max_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_max_pd(a, b) };
    // Lane-wise maximum; `b` is larger in both lanes.
    assert_eq!(r, f64x2::new(5.0, 10.0));
}
#[simd_test = "sse2"]
fn _mm_min_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_min_sd(a, b) };
    // Lane 0 is the smaller low lane; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(1.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_min_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_min_pd(a, b) };
    // Lane-wise minimum; `a` is smaller in both lanes.
    assert_eq!(r, f64x2::new(1.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_mul_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_mul_sd(a, b) };
    // Only lane 0 is multiplied; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(5.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_mul_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_mul_pd(a, b) };
    // Both lanes are multiplied.
    assert_eq!(r, f64x2::new(5.0, 20.0));
}
#[simd_test = "sse2"]
fn _mm_sqrt_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_sqrt_sd(a, b) };
    // Lane 0 is the square root of `b`'s low lane; lane 1 comes from `a`.
    assert_eq!(r, f64x2::new(5.0f64.sqrt(), 2.0));
}
#[simd_test = "sse2"]
fn _mm_sqrt_pd() {
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)) };
    // Each lane must match the scalar `f64::sqrt` of the input lane.
    assert_eq!(r, f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt()));
}
#[simd_test = "sse2"]
fn _mm_sub_sd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_sub_sd(a, b) };
    // Only lane 0 is subtracted; lane 1 is carried over from `a`.
    assert_eq!(r, f64x2::new(-4.0, 2.0));
}
#[simd_test = "sse2"]
fn _mm_sub_pd() {
    let a = f64x2::new(1.0, 2.0);
    let b = f64x2::new(5.0, 10.0);
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { sse2::_mm_sub_pd(a, b) };
    // Both lanes are subtracted (`a - b`).
    assert_eq!(r, f64x2::new(-4.0, -8.0));
}
#[simd_test = "sse2"]
@@ -2809,8 +3091,9 @@ fn _mm_and_pd() {
unsafe {
let a: f64x2 = transmute(i64x2::splat(5));
let b: f64x2 = transmute(i64x2::splat(3));
let r = sse2::_mm_and_pd(a, b);
let e: f64x2 = transmute(i64x2::splat(1));
assert_eq!(sse2::_mm_and_pd(a, b), e);
assert_eq!(r, e);
}
}
@@ -2821,8 +3104,9 @@ fn _mm_andnot_pd() {
unsafe {
let a: f64x2 = transmute(i64x2::splat(5));
let b: f64x2 = transmute(i64x2::splat(3));
let r = sse2::_mm_andnot_pd(a, b);
let e: f64x2 = transmute(i64x2::splat(2));
assert_eq!(sse2::_mm_andnot_pd(a, b), e);
assert_eq!(r, e);
}
}
@@ -2833,8 +3117,9 @@ fn _mm_or_pd() {
unsafe {
let a: f64x2 = transmute(i64x2::splat(5));
let b: f64x2 = transmute(i64x2::splat(3));
let r = sse2::_mm_or_pd(a, b);
let e: f64x2 = transmute(i64x2::splat(7));
assert_eq!(sse2::_mm_or_pd(a, b), e);
assert_eq!(r, e);
}
}
@@ -2845,8 +3130,9 @@ fn _mm_xor_pd() {
unsafe {
let a: f64x2 = transmute(i64x2::splat(5));
let b: f64x2 = transmute(i64x2::splat(3));
let r = sse2::_mm_xor_pd(a, b);
let e: f64x2 = transmute(i64x2::splat(6));
assert_eq!(sse2::_mm_xor_pd(a, b), e);
assert_eq!(r, e);
}
}
@@ -3147,40 +3433,40 @@ fn _mm_comieq_sd() {
use std::f64::NAN;
let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
assert!(sse2::_mm_comieq_sd(a, b));
assert!(unsafe { sse2::_mm_comieq_sd(a, b) });
let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(1.0, 3.0));
assert!(!sse2::_mm_comieq_sd(a, b));
assert!(unsafe { !sse2::_mm_comieq_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_comilt_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "less than" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_comilt_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_comile_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "less or equal" must hold.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { sse2::_mm_comile_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_comigt_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "greater than" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_comigt_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_comige_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "greater or equal" must hold.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { sse2::_mm_comige_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_comineq_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "not equal" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_comineq_sd(a, b) });
}
#[simd_test = "sse2"]
@@ -3188,48 +3474,48 @@ fn _mm_ucomieq_sd() {
use std::f64::NAN;
let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
assert!(sse2::_mm_ucomieq_sd(a, b));
assert!(unsafe { sse2::_mm_ucomieq_sd(a, b) });
let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(NAN, 3.0));
assert!(!sse2::_mm_ucomieq_sd(a, b));
assert!(unsafe { !sse2::_mm_ucomieq_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_ucomilt_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "less than" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_ucomilt_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_ucomile_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "less or equal" must hold.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { sse2::_mm_ucomile_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_ucomigt_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "greater than" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_ucomigt_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_ucomige_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "greater or equal" must hold.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { sse2::_mm_ucomige_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_ucomineq_sd() {
    // Low lanes are both 1.0 (the high lanes differ); "not equal" must be false.
    let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
    assert!(unsafe { !sse2::_mm_ucomineq_sd(a, b) });
}
#[simd_test = "sse2"]
fn _mm_movemask_pd() {
    // One result bit per lane; in these inputs a bit is set exactly where the
    // lane is negative. The intrinsic is an `unsafe fn`, hence the blocks.
    let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0)) };
    assert_eq!(r, 0b01);

    let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0)) };
    assert_eq!(r, 0b11);
}
}
+26 -16
View File
@@ -1,18 +1,18 @@
use v128::*;
use x86::__m128i;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v128::*;
use x86::__m128i;
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pblendvb))]
pub fn _mm_blendv_epi8(
pub unsafe fn _mm_blendv_epi8(
a: __m128i,
b: __m128i,
mask: __m128i,
) -> __m128i {
unsafe { pblendvb(a, b, mask) }
pblendvb(a, b, mask)
}
/// Returns the dot product of two f64x2 vectors.
@@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8(
/// the broadcast mask bit is zero then the return component will be zero.
#[inline(always)]
#[target_feature = "+sse4.1"]
pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
macro_rules! call {
($imm8:expr) => {
unsafe { dppd(a, b, $imm8) }
}
($imm8:expr) => { dppd(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
// Test-only wrapper that calls `_mm_dp_pd` with the immediate fixed to 0, so
// that `assert_instr(dppd)` can be attached to a function without an `imm8`
// parameter.
// NOTE(review): presumably `assert_instr` checks the generated code for a
// `dppd` instruction -- confirm against `stdsimd_test`.
#[cfg(test)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(dppd))]
fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { _mm_dp_pd(a, b, 0) }
}
/// Returns the dot product of two f32x4 vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
@@ -42,15 +47,20 @@ macro_rules! call {
/// the broadcast mask bit is zero then the return component will be zero.
#[inline(always)]
#[target_feature = "+sse4.1"]
pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
macro_rules! call {
($imm8:expr) => {
unsafe { dpps(a, b, $imm8) }
}
($imm8:expr) => { dpps(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
// Test-only wrapper that calls `_mm_dp_ps` with the immediate fixed to 0, so
// that `assert_instr(dpps)` can be attached to a function without an `imm8`
// parameter.
// NOTE(review): presumably `assert_instr` checks the generated code for a
// `dpps` instruction -- confirm against `stdsimd_test`.
#[cfg(test)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(dpps))]
fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { _mm_dp_ps(a, b, 0) }
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse41.pblendvb"]
@@ -78,7 +88,7 @@ fn _mm_blendv_epi8() {
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
let e = i8x16::new(
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e);
}
#[simd_test = "sse4.1"]
@@ -86,7 +96,7 @@ fn _mm_dp_pd() {
let a = f64x2::new(2.0, 3.0);
let b = f64x2::new(1.0, 4.0);
let e = f64x2::new(14.0, 0.0);
assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e);
}
#[simd_test = "sse4.1"]
@@ -94,6 +104,6 @@ fn _mm_dp_ps() {
let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e);
}
}
+16 -6
View File
@@ -1,3 +1,6 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
use x86::__m128i;
pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
@@ -19,7 +22,7 @@
#[inline(always)]
#[target_feature = "+sse4.2"]
pub fn _mm_cmpestri(
pub unsafe fn _mm_cmpestri(
a: __m128i,
la: i32,
b: __m128i,
@@ -27,13 +30,18 @@ pub fn _mm_cmpestri(
imm8: i8,
) -> i32 {
macro_rules! call {
($imm8:expr) => {
unsafe { pcmpestri128(a, la, b, lb, $imm8) }
}
($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
}
constify_imm8!(imm8, call)
}
// Test-only wrapper that calls `_mm_cmpestri` with the immediate fixed to 0,
// so that `assert_instr(pcmpestri)` can be attached to a function without an
// `imm8` parameter.
// NOTE(review): presumably `assert_instr` checks the generated code for a
// `pcmpestri` instruction -- confirm against `stdsimd_test`.
#[cfg(test)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri))]
fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
unsafe { _mm_cmpestri(a, la, b, lb, 0) }
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse42.pcmpestri128"]
@@ -53,8 +61,10 @@ fn _mm_cmpestri() {
let b = &b"foobar "[..];
let va = __m128i::from(u8x16::load(a, 0));
let vb = __m128i::from(u8x16::load(b, 0));
let i = sse42::_mm_cmpestri(
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
let i = unsafe {
sse42::_mm_cmpestri(
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED)
};
assert_eq!(3, i);
}
}
+26 -12
View File
@@ -1,15 +1,15 @@
use v128::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v128::*;
/// Compute the absolute value of packed 8-bit signed integers in `a` and
/// return the unsigned results.
#[inline(always)]
#[target_feature = "+ssse3"]
#[cfg_attr(test, assert_instr(pabsb))]
pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
unsafe { pabsb128(a) }
pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
pabsb128(a)
}
/// Shuffle bytes from `a` according to the content of `b`.
@@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
#[inline(always)]
#[target_feature = "+ssse3"]
#[cfg_attr(test, assert_instr(pshufb))]
pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { pshufb128(a, b) }
pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
pshufb128(a, b)
}
@@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
// Foreign declarations bound by `link_name` to the LLVM SSSE3 intrinsics that
// back the public wrappers in this module.
extern {
// Backs `_mm_abs_epi8`.
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
fn pabsb128(a: i8x16) -> u8x16;
// Backs `_mm_shuffle_epi8`.
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
}
@@ -62,16 +61,31 @@ mod tests {
#[simd_test = "ssse3"]
fn _mm_abs_epi8() {
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) };
    // Every lane holds -5, so every unsigned result lane must be 5.
    assert_eq!(r, u8x16::splat(5));
}
#[simd_test = "ssse3"]
fn _mm_shuffle_epi8() {
    let a = u8x16::new(
        1, 2, 3, 4,
        5, 6, 7, 8,
        9, 10, 11, 12,
        13, 14, 15, 16,
    );
    // Index vector. 128 (high bit set) is expected to produce 0, and the
    // out-of-range indices 24 and 19 are expected to select lanes 8 and 3,
    // i.e. only the low four bits of an index pick the byte.
    let b = u8x16::new(
        4, 128, 4, 3,
        24, 12, 6, 19,
        12, 5, 5, 10,
        4, 1, 8, 0,
    );
    let expected = u8x16::new(
        5, 0, 5, 4,
        9, 13, 7, 4,
        13, 6, 6, 11,
        5, 2, 9, 1,
    );
    // The intrinsic is an `unsafe fn`, so the call must sit in an `unsafe` block.
    let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) };
    assert_eq!(r, expected);
}
}
+110 -58
View File
@@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
pub unsafe fn _blcfill_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. `x + 1` carries through the trailing run of ones,
    // so AND-ing it with `x` clears every bit below the lowest zero bit.
    // `wrapping_add` keeps an all-ones input from overflowing (result: 0).
    x & x.wrapping_add(1)
}
@@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blcfill_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. `x + 1` carries through the trailing run of ones,
    // so AND-ing it with `x` clears every bit below the lowest zero bit.
    // `wrapping_add` keeps an all-ones input from overflowing (result: 0).
    x & x.wrapping_add(1)
}
@@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
pub unsafe fn _blci_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has every bit set except the position
    // of `x`'s lowest zero bit. `wrapping_add` avoids overflow for an
    // all-ones input, which yields all ones.
    x | !x.wrapping_add(1)
}
@@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blci_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has every bit set except the position
    // of `x`'s lowest zero bit. `wrapping_add` avoids overflow for an
    // all-ones input, which yields all ones.
    x | !x.wrapping_add(1)
}
@@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
pub unsafe fn _blcic_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has a single bit set at the position of
    // `x`'s lowest zero bit. `wrapping_add` avoids overflow for an all-ones
    // input, which yields 0.
    !x & x.wrapping_add(1)
}
@@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blcic_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has a single bit set at the position of
    // `x`'s lowest zero bit. `wrapping_add` avoids overflow for an all-ones
    // input, which yields 0.
    !x & x.wrapping_add(1)
}
@@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. XOR-ing `x` with `x + 1` leaves a mask covering
    // bit 0 up to and including `x`'s lowest zero bit. `wrapping_add` avoids
    // overflow for an all-ones input, which yields all ones.
    x ^ x.wrapping_add(1)
}
@@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. XOR-ing `x` with `x + 1` leaves a mask covering
    // bit 0 up to and including `x`'s lowest zero bit. `wrapping_add` avoids
    // overflow for an all-ones input, which yields all ones.
    x ^ x.wrapping_add(1)
}
@@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
pub unsafe fn _blcs_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. OR-ing `x` with `x + 1` sets `x`'s lowest zero bit
    // and leaves the other bits alone. `wrapping_add` avoids overflow for an
    // all-ones input, which is returned unchanged.
    x | x.wrapping_add(1)
}
@@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blcs_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. OR-ing `x` with `x + 1` sets `x`'s lowest zero bit
    // and leaves the other bits alone. `wrapping_add` avoids overflow for an
    // all-ones input, which is returned unchanged.
    x | x.wrapping_add(1)
}
@@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
pub unsafe fn _blsfill_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. OR-ing `x` with `x - 1` sets every bit below `x`'s
    // lowest set bit. `wrapping_sub` avoids underflow for 0, which yields
    // all ones.
    x | x.wrapping_sub(1)
}
@@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blsfill_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. OR-ing `x` with `x - 1` sets every bit below `x`'s
    // lowest set bit. `wrapping_sub` avoids underflow for 0, which yields
    // all ones.
    x | x.wrapping_sub(1)
}
@@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
pub unsafe fn _blsic_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has every bit set except the position
    // of `x`'s lowest set bit. `wrapping_sub` avoids underflow for 0, which
    // yields all ones.
    !x | x.wrapping_sub(1)
}
@@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _blsic_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result has every bit set except the position
    // of `x`'s lowest set bit. `wrapping_sub` avoids underflow for 0, which
    // yields all ones.
    !x | x.wrapping_sub(1)
}
@@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result clears the positions of `x`'s trailing
    // ones and sets every other bit. `wrapping_add` avoids overflow for an
    // all-ones input, which yields 0.
    !x | x.wrapping_add(1)
}
@@ -225,7 +225,7 @@ pub fn _t1mskc_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result clears the positions of `x`'s trailing
    // ones and sets every other bit. `wrapping_add` avoids overflow for an
    // all-ones input, which yields 0.
    !x | x.wrapping_add(1)
}
@@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result is a mask of `x`'s trailing zero bits
    // (0 when bit 0 is set). `wrapping_sub` avoids underflow for 0, which
    // yields all ones.
    !x & x.wrapping_sub(1)
}
@@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
    // Declared `unsafe` like the other vendor intrinsics; the body is plain
    // integer arithmetic. The result is a mask of `x`'s trailing zero bits
    // (0 when bit 0 is set). `wrapping_sub` avoids underflow for 0, which
    // yields all ones.
    !x & x.wrapping_sub(1)
}
@@ -272,122 +272,174 @@ fn _bextr_u64() {
#[simd_test = "tbm"]
fn _blcfill_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Trailing ones are cleared.
    assert_eq!(
        unsafe { tbm::_blcfill_u32(0b0101_0111u32) },
        0b0101_0000u32);
    assert_eq!(
        unsafe { tbm::_blcfill_u32(0b1111_1111u32) },
        0u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcfill_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Trailing ones are cleared.
    assert_eq!(
        unsafe { tbm::_blcfill_u64(0b0101_0111u64) },
        0b0101_0000u64);
    assert_eq!(
        unsafe { tbm::_blcfill_u64(0b1111_1111u64) },
        0u64);
}
#[simd_test = "tbm"]
fn _blci_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: all ones except the position of the input's lowest zero bit.
    assert_eq!(
        unsafe { tbm::_blci_u32(0b0101_0000u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1110u32);
    assert_eq!(
        unsafe { tbm::_blci_u32(0b1111_1111u32) },
        0b1111_1111_1111_1111_1111_1110_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blci_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: all ones except the position of the input's lowest zero bit.
    assert_eq!(
        unsafe { tbm::_blci_u64(0b0101_0000u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
    assert_eq!(
        unsafe { tbm::_blci_u64(0b1111_1111u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blcic_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: only the bit at the input's lowest zero position is set.
    assert_eq!(
        unsafe { tbm::_blcic_u32(0b0101_0001u32) },
        0b0000_0010u32);
    assert_eq!(
        unsafe { tbm::_blcic_u32(0b1111_1111u32) },
        0b1_0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcic_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: only the bit at the input's lowest zero position is set.
    assert_eq!(
        unsafe { tbm::_blcic_u64(0b0101_0001u64) },
        0b0000_0010u64);
    assert_eq!(
        unsafe { tbm::_blcic_u64(0b1111_1111u64) },
        0b1_0000_0000u64);
}
#[simd_test = "tbm"]
fn _blcmsk_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: a mask from bit 0 through the input's lowest zero bit.
    assert_eq!(
        unsafe { tbm::_blcmsk_u32(0b0101_0001u32) },
        0b0000_0011u32);
    assert_eq!(
        unsafe { tbm::_blcmsk_u32(0b1111_1111u32) },
        0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcmsk_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: a mask from bit 0 through the input's lowest zero bit.
    assert_eq!(
        unsafe { tbm::_blcmsk_u64(0b0101_0001u64) },
        0b0000_0011u64);
    assert_eq!(
        unsafe { tbm::_blcmsk_u64(0b1111_1111u64) },
        0b1_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blcs_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: the input with its lowest zero bit set.
    assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32);
    assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcs_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: the input with its lowest zero bit set.
    assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64);
    assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blsfill_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: every bit below the input's lowest set bit is set; 0 maps to
    // all ones.
    assert_eq!(
        unsafe { tbm::_blsfill_u32(0b0101_0100u32) },
        0b0101_0111u32);
    assert_eq!(
        unsafe { tbm::_blsfill_u32(0u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blsfill_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: every bit below the input's lowest set bit is set; 0 maps to
    // all ones.
    assert_eq!(
        unsafe { tbm::_blsfill_u64(0b0101_0100u64) },
        0b0101_0111u64);
    assert_eq!(
        unsafe { tbm::_blsfill_u64(0u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blsic_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: all ones except the position of the input's lowest set bit;
    // 0 maps to all ones.
    assert_eq!(
        unsafe { tbm::_blsic_u32(0b0101_0100u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1011u32);
    assert_eq!(
        unsafe { tbm::_blsic_u32(0u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blsic_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: all ones except the position of the input's lowest set bit;
    // 0 maps to all ones.
    assert_eq!(
        unsafe { tbm::_blsic_u64(0b0101_0100u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
    assert_eq!(
        unsafe { tbm::_blsic_u64(0u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _t1mskc_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: the positions of the input's trailing ones are cleared and
    // every other bit is set.
    assert_eq!(
        unsafe { tbm::_t1mskc_u32(0b0101_0111u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1000u32);
    assert_eq!(
        unsafe { tbm::_t1mskc_u32(0u32) },
        0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _t1mksc_u64() {
    // NOTE(review): the test name reads `_t1mksc_u64` while the intrinsic it
    // exercises is `_t1mskc_u64`; the name is kept as-is here.
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: the positions of the input's trailing ones are cleared and
    // every other bit is set.
    assert_eq!(
        unsafe { tbm::_t1mskc_u64(0b0101_0111u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
    assert_eq!(
        unsafe { tbm::_t1mskc_u64(0u64) },
        0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _tzmsk_u32() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: a mask of the input's trailing zeros; 0 when bit 0 is set.
    assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32);
    assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _tzmsk_u64() {
    // The intrinsic is an `unsafe fn`; each call is wrapped in `unsafe`.
    // Expected: a mask of the input's trailing zeros; 0 when bit 0 is set.
    assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64);
    assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64);
}
}