x86: add unsafe to all x86 vendor intrinsics

Also, add missing assert_instr tests to each intrinsic, where possible.
2026-05-28 20:16:58 +03:00 · 2017-09-26 21:53:50 -04:00
parent ff9e960628
commit 6dfc65289c
12 changed files with 1611 additions and 1213 deletions
@@ -24,9 +24,11 @@ fn index(needle: &str, haystack: &str) -> usize {
        haystack.resize(16, 0);
        let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));

-        vendor::_mm_cmpestri(
-            vneedle, needle_len as i32, vhaystack, hay_len as i32,
-            vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
+        unsafe {
+            vendor::_mm_cmpestri(
+                vneedle, needle_len as i32, vhaystack, hay_len as i32,
+                vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
+        }
    }

    pub fn main() {
@@ -19,7 +19,7 @@
 #[inline(always)]
 #[target_feature = "+lzcnt"]
 #[cfg_attr(test, assert_instr(lzcnt))]
-pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
+pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }

 /// Counts the leading most significant zero bits.
 ///
@@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
 #[inline(always)]
 #[target_feature = "+lzcnt"]
 #[cfg_attr(test, assert_instr(lzcnt))]
-pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
+pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }

 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
+pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }

 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
+pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }

 #[cfg(test)]
 mod tests {
@@ -49,21 +49,21 @@ mod tests {

    #[simd_test = "lzcnt"]
    fn _lzcnt_u32() {
-        assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
+        assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32);
    }

    #[simd_test = "lzcnt"]
    fn _lzcnt_u64() {
-        assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
+        assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64);
    }

    #[simd_test = "popcnt"]
    fn _popcnt32() {
-        assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
+        assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4);
    }

    #[simd_test = "popcnt"]
    fn _popcnt64() {
-        assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
+        assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4);
    }
 }
@@ -1,14 +1,14 @@
-use v256::*;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v256::*;
+
 /// Add packed double-precision (64-bit) floating-point elements
 /// in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddpd))]
-pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
    a + b
 }

@@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddps))]
-pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
    a + b
 }

@@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vmulpd))]
-pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
    a * b
 }

@@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vmulps))]
-pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
    a * b
 }

@@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddsubpd))]
-pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
-    unsafe { addsubpd256(a, b) }
+pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
+    addsubpd256(a, b)
 }

 /// Alternatively add and subtract packed single-precision (32-bit)
@@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddsubps))]
-pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
-    unsafe { addsubps256(a, b) }
+pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
+    addsubps256(a, b)
 }

 /// Subtract packed double-precision (64-bit) floating-point elements in `b`
@@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vsubpd))]
-pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
    a - b
 }

@@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vsubps))]
-pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
    a - b
 }

 /// Round packed double-precision (64-bit) floating point elements in `a`
 /// according to the flag `b`. The value of `b` may be as follows:
+///
+/// ```ignore
 /// 0x00: Round to the nearest whole number.
 /// 0x01: Round down, toward negative infinity.
 /// 0x02: Round up, toward positive infinity.
 /// 0x03: Truncate the values.
-/// For a few additional values options, check the LLVM docs:
-/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+/// ```
 #[inline(always)]
 #[target_feature = "+avx"]
-pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
+pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { roundpd256(a, $imm8) }
-        }
+        ($imm8:expr) => { roundpd256(a, $imm8) }
    }
    constify_imm8!(b, call)
 }
@@ -96,7 +95,7 @@ macro_rules! call {
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[target_feature = "+avx"]
 fn test_mm256_round_pd(a: f64x4) -> f64x4 {
-    _mm256_round_pd(a, 0x3)
+    unsafe { _mm256_round_pd(a, 0x3) }
 }

 /// Round packed double-precision (64-bit) floating point elements in `a` toward
@@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vroundpd))]
-pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
-    unsafe { roundpd256(a, 0x02) }
+pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
+    roundpd256(a, 0x02)
 }

 /// Round packed double-precision (64-bit) floating point elements in `a` toward
@@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vroundpd))]
-pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
-    unsafe { roundpd256(a, 0x01) }
+pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
+    roundpd256(a, 0x01)
 }

 /// LLVM intrinsics used in the above functions
@@ -139,7 +138,7 @@ mod tests {
    fn _mm256_add_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_add_pd(a, b);
+        let r = unsafe { avx::_mm256_add_pd(a, b) };
        let e = f64x4::new(6.0, 8.0, 10.0, 12.0);
        assert_eq!(r, e);
    }
@@ -148,7 +147,7 @@ fn _mm256_add_pd() {
    fn _mm256_add_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
-        let r = avx::_mm256_add_ps(a, b);
+        let r = unsafe { avx::_mm256_add_ps(a, b) };
        let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0);
        assert_eq!(r, e);
    }
@@ -157,7 +156,7 @@ fn _mm256_add_ps() {
    fn _mm256_mul_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_mul_pd(a, b);
+        let r = unsafe { avx::_mm256_mul_pd(a, b) };
        let e = f64x4::new(5.0, 12.0, 21.0, 32.0);
        assert_eq!(r, e);
    }
@@ -166,7 +165,7 @@ fn _mm256_mul_pd() {
    fn _mm256_mul_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
-        let r = avx::_mm256_mul_ps(a, b);
+        let r = unsafe { avx::_mm256_mul_ps(a, b) };
        let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0);
        assert_eq!(r, e);
    }
@@ -175,7 +174,7 @@ fn _mm256_mul_ps() {
    fn _mm256_addsub_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_addsub_pd(a, b);
+        let r = unsafe { avx::_mm256_addsub_pd(a, b) };
        let e = f64x4::new(-4.0, 8.0, -4.0, 12.0);
        assert_eq!(r, e);
    }
@@ -184,7 +183,7 @@ fn _mm256_addsub_pd() {
    fn _mm256_addsub_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_addsub_ps(a, b);
+        let r = unsafe { avx::_mm256_addsub_ps(a, b) };
        let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0);
        assert_eq!(r, e);
    }
@@ -193,7 +192,7 @@ fn _mm256_addsub_ps() {
    fn _mm256_sub_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_sub_pd(a, b);
+        let r = unsafe { avx::_mm256_sub_pd(a, b) };
        let e = f64x4::new(-4.0,-4.0,-4.0,-4.0);
        assert_eq!(r, e);
    }
@@ -202,7 +201,7 @@ fn _mm256_sub_pd() {
    fn _mm256_sub_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0);
        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0);
-        let r = avx::_mm256_sub_ps(a, b);
+        let r = unsafe { avx::_mm256_sub_ps(a, b) };
        let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0);
        assert_eq!(r, e);
    }
@@ -210,9 +209,9 @@ fn _mm256_sub_ps() {
    #[simd_test = "avx"]
    fn _mm256_round_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_closest = avx::_mm256_round_pd(a, 0b00000000);
-        let result_down = avx::_mm256_round_pd(a, 0b00000001);
-        let result_up = avx::_mm256_round_pd(a, 0b00000010);
+        let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) };
+        let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) };
+        let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) };
        let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0);
        let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
        let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
@@ -224,7 +223,7 @@ fn _mm256_round_pd() {
    #[simd_test = "avx"]
    fn _mm256_floor_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_down = avx::_mm256_floor_pd(a);
+        let result_down = unsafe { avx::_mm256_floor_pd(a) };
        let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
        assert_eq!(result_down, expected_down);
    }
@@ -232,7 +231,7 @@ fn _mm256_floor_pd() {
    #[simd_test = "avx"]
    fn _mm256_ceil_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_up = avx::_mm256_ceil_pd(a, );
+        let result_up = unsafe { avx::_mm256_ceil_pd(a) };
        let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
        assert_eq!(result_up, expected_up);
    }
@@ -9,31 +9,31 @@
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpabsd))]
-pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
-    unsafe { pabsd(a) }
+pub unsafe fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
+    pabsd(a)
 }

 /// Computes the absolute values of packed 16-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpabsw))]
-pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
-    unsafe { pabsw(a) }
+pub unsafe fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
+    pabsw(a)
 }

 /// Computes the absolute values of packed 8-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpabsb))]
-pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
-    unsafe { pabsb(a) }
+pub unsafe fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
+    pabsb(a)
 }

 /// Add packed 64-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddq))]
-pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
+pub unsafe fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a + b
 }

@@ -41,7 +41,7 @@ pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddd))]
-pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
+pub unsafe fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a + b
 }

@@ -49,7 +49,7 @@ pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddw))]
-pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
+pub unsafe fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a + b
 }

@@ -57,7 +57,7 @@ pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddb))]
-pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
+pub unsafe fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a + b
 }

@@ -65,32 +65,32 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddsb))]
-pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    unsafe { paddsb(a, b) }
+pub unsafe fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    paddsb(a, b)
 }

 /// Add packed 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddsw))]
-pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { paddsw(a, b) }
+pub unsafe fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    paddsw(a, b)
 }

 /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddusb))]
-pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
-    unsafe { paddusb(a, b) }
+pub unsafe fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
+    paddusb(a, b)
 }

 /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpaddusw))]
-pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { paddusw(a, b) }
+pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    paddusw(a, b)
 }

 // TODO _mm256_alignr_epi8
@@ -100,7 +100,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vandps))]
-pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    a & b
 }

@@ -109,7 +109,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vandnps))]
-pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    (!a) & b
 }

@@ -117,16 +117,16 @@ pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpavgw))]
-pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { pavgw(a, b) }
+pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
+    pavgw(a, b)
 }

 /// Average packed unsigned 8-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpavgb))]
-pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
-    unsafe { pavgb(a, b) }
+pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
+    pavgb(a, b)
 }

 // TODO _mm256_blend_epi16
@@ -137,8 +137,8 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpblendvb))]
-pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
-    unsafe { pblendvb(a,b,mask) }
+pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
+    pblendvb(a,b,mask)
 }

 // TODO _mm_broadcastb_epi8
@@ -158,12 +158,11 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
 // TODO _mm256_bslli_epi128
 // TODO _mm256_bsrli_epi128

-
 /// Compare packed 64-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpeqq))]
-pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
+pub unsafe fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a.eq(b)
 }

@@ -171,7 +170,7 @@ pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpeqd))]
-pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
+pub unsafe fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a.eq(b)
 }

@@ -179,7 +178,7 @@ pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpeqw))]
-pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
+pub unsafe fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a.eq(b)
 }

@@ -187,7 +186,7 @@ pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpeqb))]
-pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
+pub unsafe fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a.eq(b)
 }

@@ -195,7 +194,7 @@ pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpgtq))]
-pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
+pub unsafe fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a.gt(b)
 }

@@ -203,7 +202,7 @@ pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpgtd))]
-pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
+pub unsafe fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a.gt(b)
 }

@@ -211,7 +210,7 @@ pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpgtw))]
-pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
+pub unsafe fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a.gt(b)
 }

@@ -219,7 +218,7 @@ pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpcmpgtb))]
-pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
+pub unsafe fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a.gt(b)
 }

@@ -241,16 +240,16 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphaddw))]
-pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { phaddw(a, b) }
+pub unsafe fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    phaddw(a, b)
 }

 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphaddd))]
-pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    unsafe { phaddd(a, b) }
+pub unsafe fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    phaddd(a, b)
 }

 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`
@@ -258,24 +257,24 @@ pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphaddsw))]
-pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { phaddsw(a, b) }
+pub unsafe fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    phaddsw(a, b)
 }

 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphsubw))]
-pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { phsubw(a, b) }
+pub unsafe fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    phsubw(a, b)
 }

 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphsubd))]
-pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    unsafe { phsubd(a, b) }
+pub unsafe fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    phsubd(a, b)
 }

 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -283,8 +282,8 @@ pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vphsubsw))]
-pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { phsubsw(a, b) }
+pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    phsubsw(a, b)
 }


@@ -328,8 +327,8 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
-    unsafe { pmaddwd(a, b) }
+pub unsafe fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
+    pmaddwd(a, b)
 }

 /// Vertically multiply each unsigned 8-bit integer from `a` with the
@@ -339,8 +338,8 @@ pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaddubsw))]
-pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
-    unsafe { pmaddubsw(a, b) }
+pub unsafe fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
+    pmaddubsw(a, b)
 }

 // TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
@@ -357,8 +356,8 @@ pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxsw))]
-pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { pmaxsw(a, b) }
+pub unsafe fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    pmaxsw(a, b)
 }

 /// Compare packed 32-bit integers in `a` and `b`, and return the packed
@@ -366,8 +365,8 @@ pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxsd))]
-pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    unsafe { pmaxsd(a, b) }
+pub unsafe fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    pmaxsd(a, b)
 }

 /// Compare packed 8-bit integers in `a` and `b`, and return the packed
@@ -375,8 +374,8 @@ pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxsb))]
-pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    unsafe { pmaxsb(a, b) }
+pub unsafe fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    pmaxsb(a, b)
 }

 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return
@@ -384,8 +383,8 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxuw))]
-pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { pmaxuw(a, b) }
+pub unsafe fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    pmaxuw(a, b)
 }

 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return
@@ -393,8 +392,8 @@ pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
-    unsafe { pmaxud(a, b) }
+pub unsafe fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
+    pmaxud(a, b)
 }

 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return
@@ -402,8 +401,8 @@ pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmaxub))]
-pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
-    unsafe { pmaxub(a, b) }
+pub unsafe fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
+    pmaxub(a, b)
 }

 /// Compare packed 16-bit integers in `a` and `b`, and return the packed
@@ -411,8 +410,8 @@ pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminsw))]
-pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { pminsw(a, b) }
+pub unsafe fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    pminsw(a, b)
 }

 /// Compare packed 32-bit integers in `a` and `b`, and return the packed
@@ -420,8 +419,8 @@ pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    unsafe { pminsd(a, b) }
+pub unsafe fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    pminsd(a, b)
 }

 /// Compare packed 8-bit integers in `a` and `b`, and return the packed
@@ -429,8 +428,8 @@ pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminsb))]
-pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    unsafe { pminsb(a, b) }
+pub unsafe fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    pminsb(a, b)
 }

 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return
@@ -438,8 +437,8 @@ pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminuw))]
-pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { pminuw(a, b) }
+pub unsafe fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    pminuw(a, b)
 }

 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return
@@ -447,8 +446,8 @@ pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
-    unsafe { pminud(a, b) }
+pub unsafe fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
+    pminud(a, b)
 }

 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return
@@ -456,8 +455,8 @@ pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpminub))]
-pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
-    unsafe { pminub(a, b) }
+pub unsafe fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
+    pminub(a, b)
 }

 /*** The following two functions fail in debug, but work in release
@@ -492,8 +491,8 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmuldq))]
-pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
-    unsafe { pmuldq(a, b) }
+pub unsafe fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
+    pmuldq(a, b)
 }

 /// Multiply the low unsigned 32-bit integers from each packed 64-bit
@@ -503,8 +502,8 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmuludq))]
-pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
-    unsafe { pmuludq(a, b) }
+pub unsafe fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
+    pmuludq(a, b)
 }

 /// Multiply the packed 16-bit integers in `a` and `b`, producing
@@ -513,8 +512,8 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmulhw))]
-pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { pmulhw(a, b) }
+pub unsafe fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    pmulhw(a, b)
 }

 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -523,8 +522,8 @@ pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmulhuw))]
-pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { pmulhuw(a, b) }
+pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    pmulhuw(a, b)
 }

 /// Multiply the packed 16-bit integers in `a` and `b`, producing
@@ -533,7 +532,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmullw))]
-pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
+pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
    a * b
 }

@@ -544,7 +543,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmulld))]
-pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
+pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
    a * b
 }

@@ -555,8 +554,8 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmulhrsw))]
-pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
-    unsafe { pmulhrsw(a, b) }
+pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
+    pmulhrsw(a, b)
 }

 /// Compute the bitwise OR of 256 bits (representing integer data) in `a`
@@ -564,7 +563,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vorps))]
-pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
    a | b
 }

@@ -573,8 +572,8 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
-    unsafe { packsswb(a, b) }
+pub unsafe fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
+    packsswb(a, b)
 }

 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -582,8 +581,8 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
-    unsafe { packssdw(a, b) }
+pub unsafe fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
+    packssdw(a, b)
 }

 /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
@@ -591,8 +590,8 @@ pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
-    unsafe { packuswb(a, b) }
+pub unsafe fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
+    packuswb(a, b)
 }

 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -600,8 +599,8 @@ pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
-    unsafe { packusdw(a, b) }
+pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
+    packusdw(a, b)
 }

 // TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
@@ -617,8 +616,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsadbw))]
-pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
-    unsafe { psadbw(a, b) }
+pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
+    psadbw(a, b)
 }

 // TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
@@ -632,8 +631,8 @@ pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsignw))]
-pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { psignw(a, b) }
+pub unsafe fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    psignw(a, b)
 }

 /// Negate packed 32-bit integers in `a` when the corresponding signed
@@ -642,8 +641,8 @@ pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsignd))]
-pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    unsafe { psignd(a, b) }
+pub unsafe fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
+    psignd(a, b)
 }

 /// Negate packed 8-bit integers in `a` when the corresponding signed
@@ -652,8 +651,8 @@ pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsignb))]
-pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    unsafe { psignb(a, b) }
+pub unsafe fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    psignb(a, b)
 }

 /// Shift packed 16-bit integers in `a` left by `count` while
@@ -661,8 +660,8 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllw))]
-pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
-    unsafe { psllw(a, count) }
+pub unsafe fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
+    psllw(a, count)
 }

 /// Shift packed 32-bit integers in `a` left by `count` while
@@ -670,8 +669,8 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpslld))]
-pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
-    unsafe { pslld(a, count) }
+pub unsafe fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
+    pslld(a, count)
 }

 /// Shift packed 64-bit integers in `a` left by `count` while
@@ -679,35 +678,35 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllq))]
-pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
-    unsafe { psllq(a, count) }
+pub unsafe fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
+    psllq(a, count)
 }

 /// Shift packed 16-bit integers in `a` left by `imm8` while
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsllw))] // TODO: should this be pslli
-pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
-    unsafe { pslliw(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsllw))]
+pub unsafe fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    pslliw(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` left by `imm8` while
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpslld))] // TODO: should this be pslli
-pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
-    unsafe { psllid(a, imm8) }
+#[cfg_attr(test, assert_instr(vpslld))]
+pub unsafe fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
+    psllid(a, imm8)
 }

 /// Shift packed 64-bit integers in `a` left by `imm8` while
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsllq))] // TODO: should this be pslli
-pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
-    unsafe { pslliq(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsllq))]
+pub unsafe fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
+    pslliq(a, imm8)
 }

 // TODO _mm256_slli_si256 (__m256i a, const int imm8)
@@ -718,8 +717,8 @@ pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllvd))]
-pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { psllvd(a, count) }
+pub unsafe fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    psllvd(a, count)
 }

 /// Shift packed 32-bit integers in `a` left by the amount
@@ -728,8 +727,8 @@ pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllvd))]
-pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
-    unsafe { psllvd256(a, count) }
+pub unsafe fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
+    psllvd256(a, count)
 }

 /// Shift packed 64-bit integers in `a` left by the amount
@@ -738,8 +737,8 @@ pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllvq))]
-pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
-    unsafe { psllvq(a, count) }
+pub unsafe fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
+    psllvq(a, count)
 }

 /// Shift packed 64-bit integers in `a` left by the amount
@@ -748,8 +747,8 @@ pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsllvq))]
-pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
-    unsafe { psllvq256(a, count) }
+pub unsafe fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
+    psllvq256(a, count)
 }

 /// Shift packed 16-bit integers in `a` right by `count` while
@@ -757,8 +756,8 @@ pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsraw))]
-pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
-    unsafe { psraw(a, count) }
+pub unsafe fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
+    psraw(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by `count` while
@@ -766,26 +765,26 @@ pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrad))]
-pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
-    unsafe { psrad(a, count) }
+pub unsafe fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
+    psrad(a, count)
 }

 /// Shift packed 16-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsraw))] // TODO: notvpsraiw?
-pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
-    unsafe { psraiw(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsraw))]
+pub unsafe fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    psraiw(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsrad))] // TODO: not vpsraid?
-pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
-    unsafe { psraid(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsrad))]
+pub unsafe fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
+    psraid(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by the amount specified by the
@@ -793,8 +792,8 @@ pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsravd))]
-pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { psravd(a, count) }
+pub unsafe fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    psravd(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by the amount specified by the
@@ -802,8 +801,8 @@ pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsravd))]
-pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
-    unsafe { psravd256(a, count) }
+pub unsafe fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
+    psravd256(a, count)
 }


@@ -812,8 +811,8 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlw))]
-pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
-    unsafe { psrlw(a, count) }
+pub unsafe fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
+    psrlw(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by `count` while shifting in
@@ -821,8 +820,8 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrld))]
-pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
-    unsafe { psrld(a, count) }
+pub unsafe fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
+    psrld(a, count)
 }

 /// Shift packed 64-bit integers in `a` right by `count` while shifting in
@@ -830,35 +829,35 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlq))]
-pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
-    unsafe { psrlq(a, count) }
+pub unsafe fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
+    psrlq(a, count)
 }

 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsrlw))] // TODO not vpsrliw?
-pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
-    unsafe { psrliw(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsrlw))]
+pub unsafe fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    psrliw(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsrld))] // TODO: not vpsrlid?
-pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
-    unsafe { psrlid(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsrld))]
+pub unsafe fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
+    psrlid(a, imm8)
 }

 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
-#[cfg_attr(test, assert_instr(vpsrlq))] // TODO: not vpsrliq?
-pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
-    unsafe { psrliq(a, imm8) }
+#[cfg_attr(test, assert_instr(vpsrlq))]
+pub unsafe fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
+    psrliq(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by the amount specified by
@@ -866,8 +865,8 @@ pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlvd))]
-pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { psrlvd(a, count) }
+pub unsafe fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    psrlvd(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by the amount specified by
@@ -875,8 +874,8 @@ pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlvd))]
-pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
-    unsafe { psrlvd256(a, count) }
+pub unsafe fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
+    psrlvd256(a, count)
 }

 /// Shift packed 64-bit integers in `a` right by the amount specified by
@@ -884,8 +883,8 @@ pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlvq))]
-pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
-    unsafe { psrlvq(a, count) }
+pub unsafe fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
+    psrlvq(a, count)
 }

 /// Shift packed 64-bit integers in `a` right by the amount specified by
@@ -893,8 +892,8 @@ pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsrlvq))]
-pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
-    unsafe { psrlvq256(a, count) }
+pub unsafe fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
+    psrlvq256(a, count)
 }

 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
@@ -903,7 +902,7 @@ pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubw))]
-pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
+pub unsafe fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a - b
 }

@@ -911,7 +910,7 @@ pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubd))]
-pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
+pub unsafe fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a - b
 }

@@ -919,7 +918,7 @@ pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubq))]
-pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
+pub unsafe fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a - b
 }

@@ -927,7 +926,7 @@ pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubb))]
-pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
+pub unsafe fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a - b
 }

@@ -936,8 +935,8 @@ pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubsw))]
-pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    unsafe { psubsw(a, b) }
+pub unsafe fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    psubsw(a, b)
 }

 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
@@ -945,8 +944,8 @@ pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubsb))]
-pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    unsafe { psubsb(a, b) }
+pub unsafe fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
+    psubsb(a, b)
 }

 /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
@@ -954,8 +953,8 @@ pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubusw))]
-pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
-    unsafe { psubusw(a, b) }
+pub unsafe fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    psubusw(a, b)
 }

 /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
@@ -963,8 +962,8 @@ pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpsubusb))]
-pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
-    unsafe { psubusb(a, b) }
+pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
+    psubusb(a, b)
 }

 // TODO __mm256_unpackhi_epi16 (__m256i a, __m256i b)
@@ -981,11 +980,10 @@ pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vxorps))]
-pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
+pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
    a ^ b
 }

-
 #[allow(improper_ctypes)]
 extern "C" {
    #[link_name = "llvm.x86.avx2.pabs.b"]
@@ -1048,9 +1046,9 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
    fn pminud(a: u32x8, b: u32x8) -> u32x8;
    #[link_name = "llvm.x86.avx2.pminu.b"]
    fn pminub(a: u8x32, b: u8x32) -> u8x32;
-    #[link_name = "llvm.x86.avx2.pmovmskb"]  //fails in debug
+    #[link_name = "llvm.x86.avx2.pmovmskb"]
    fn pmovmskb(a: i8x32) -> i32;
-    #[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug
+    #[link_name = "llvm.x86.avx2.mpsadbw"]
    fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
    #[link_name = "llvm.x86.avx2.pmulhu.w"]
    fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
@@ -1141,7 +1139,6 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {

 }

-
 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;
@@ -1157,7 +1154,7 @@ fn _mm256_abs_epi32() {
        let a = i32x8::new(
            0, 1, -1, std::i32::MAX,
            std::i32::MIN + 1, 100, -100, -32);
-        let r = avx2::_mm256_abs_epi32(a);
+        let r = unsafe { avx2::_mm256_abs_epi32(a) };
        let e = i32x8::new(
            0, 1, 1, std::i32::MAX,
            (std::i32::MIN + 1).abs(), 100, 100, 32);
@@ -1171,7 +1168,7 @@ fn _mm256_abs_epi16() {
            -2, 3, -3, 4,
            -4, 5, -5, std::i16::MAX,
            std::i16::MIN + 1, 100, -100, -32);
-        let r = avx2::_mm256_abs_epi16(a);
+        let r = unsafe { avx2::_mm256_abs_epi16(a) };
        let e = i16x16::new(
            0, 1, 1, 2,
            2, 3, 3, 4,
@@ -1191,7 +1188,7 @@ fn _mm256_abs_epi8() {
            -2, 3, -3, 4,
            -4, 5, -5, std::i8::MAX,
            std::i8::MIN + 1, 100, -100, -32);
-        let r = avx2::_mm256_abs_epi8(a);
+        let r = unsafe { avx2::_mm256_abs_epi8(a) };
        let e = i8x32::new(
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
@@ -1204,7 +1201,7 @@ fn _mm256_abs_epi8() {
    fn _mm256_add_epi64() {
        let a = i64x4::new(-10, 0, 100, 1_000_000_000);
        let b = i64x4::new(-1, 0, 1, 2);
-        let r = avx2::_mm256_add_epi64(a, b);
+        let r = unsafe { avx2::_mm256_add_epi64(a, b) };
        let e = i64x4::new(-11, 0, 101, 1_000_000_002);
        assert_eq!(r, e);
    }
@@ -1213,7 +1210,7 @@ fn _mm256_add_epi64() {
    fn _mm256_add_epi32() {
        let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6);
        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let r = avx2::_mm256_add_epi32(a, b);
+        let r = unsafe { avx2::_mm256_add_epi32(a, b) };
        let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14);
        assert_eq!(r, e);
    }
@@ -1226,7 +1223,7 @@ fn _mm256_add_epi16() {
        let b = i16x16::new(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15);
-        let r = avx2::_mm256_add_epi16(a, b);
+        let r = unsafe { avx2::_mm256_add_epi16(a, b) };
        let e = i16x16::new(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30);
@@ -1245,7 +1242,7 @@ fn _mm256_add_epi8() {
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31);
-        let r = avx2::_mm256_add_epi8(a, b);
+        let r = unsafe { avx2::_mm256_add_epi8(a, b) };
        let e = i8x32::new(
            0, 2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
@@ -1262,7 +1259,7 @@ fn _mm256_adds_epi8() {
        let b = i8x32::new(
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
-        let r = avx2::_mm256_adds_epi8(a, b);
+        let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
        let e = i8x32::new(
            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
@@ -1273,7 +1270,7 @@ fn _mm256_adds_epi8() {
    fn _mm256_adds_epi8_saturate_positive() {
        let a = i8x32::splat(0x7F);
        let b = i8x32::splat(1);
-        let r = avx2::_mm256_adds_epi8(a, b);
+        let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -1281,7 +1278,7 @@ fn _mm256_adds_epi8_saturate_positive() {
    fn _mm256_adds_epi8_saturate_negative() {
        let a = i8x32::splat(-0x80);
        let b = i8x32::splat(-1);
-        let r = avx2::_mm256_adds_epi8(a, b);
+        let r = unsafe { avx2::_mm256_adds_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -1291,7 +1288,7 @@ fn _mm256_adds_epi16() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i16x16::new(
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
-        let r = avx2::_mm256_adds_epi16(a,  b);
+        let r = unsafe { avx2::_mm256_adds_epi16(a,  b) };
        let e = i16x16::new(
            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);

@@ -1302,7 +1299,7 @@ fn _mm256_adds_epi16() {
    fn _mm256_adds_epi16_saturate_positive() {
        let a = i16x16::splat(0x7FFF);
        let b = i16x16::splat(1);
-        let r = avx2::_mm256_adds_epi16(a, b);
+        let r = unsafe { avx2::_mm256_adds_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1310,7 +1307,7 @@ fn _mm256_adds_epi16_saturate_positive() {
    fn _mm256_adds_epi16_saturate_negative() {
        let a = i16x16::splat(-0x8000);
        let b = i16x16::splat(-1);
-        let r = avx2::_mm256_adds_epi16(a, b);
+        let r = unsafe { avx2::_mm256_adds_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1322,7 +1319,7 @@ fn _mm256_adds_epu8() {
        let b = u8x32::new(
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
-        let r = avx2::_mm256_adds_epu8(a, b);
+        let r = unsafe { avx2::_mm256_adds_epu8(a, b) };
        let e = u8x32::new(
            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
@@ -1333,7 +1330,7 @@ fn _mm256_adds_epu8() {
    fn _mm256_adds_epu8_saturate() {
        let a = u8x32::splat(0xFF);
        let b = u8x32::splat(1);
-        let r = avx2::_mm256_adds_epu8(a, b);
+        let r = unsafe { avx2::_mm256_adds_epu8(a, b) };
        assert_eq!(r, a);
    }

@@ -1344,7 +1341,7 @@ fn _mm256_adds_epu16() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = u16x16::new(
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
-        let r = avx2::_mm256_adds_epu16(a, b);
+        let r = unsafe { avx2::_mm256_adds_epu16(a, b) };
        let e = u16x16::new(
            32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);

@@ -1355,35 +1352,37 @@ fn _mm256_adds_epu16() {
    fn _mm256_adds_epu16_saturate() {
        let a = u16x16::splat(0xFFFF);
        let b = u16x16::splat(1);
-        let r = avx2::_mm256_adds_epu16(a, b);
+        let r = unsafe { avx2::_mm256_adds_epu16(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "avx2"]
    fn _mm256_and_si256() {
-        assert_eq!(
-            avx2::_mm256_and_si256(
-                __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1));
+        let got = unsafe {
+            avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3))
+        };
+        assert_eq!(got, __m256i::splat(1));
    }

    #[simd_test = "avx2"]
    fn _mm256_andnot_si256() {
-        assert_eq!(
-            avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)),
-            __m256i::splat(2));
+        let got = unsafe {
+            avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3))
+        };
+        assert_eq!(got, __m256i::splat(2));
    }

    #[simd_test = "avx2"]
    fn _mm256_avg_epu8() {
        let (a, b) = (u8x32::splat(3), u8x32::splat(9));
-        let r = avx2::_mm256_avg_epu8(a, b);
+        let r = unsafe { avx2::_mm256_avg_epu8(a, b) };
        assert_eq!(r, u8x32::splat(6));
    }

    #[simd_test = "avx2"]
    fn _mm256_avg_epu16() {
        let (a, b) = (u16x16::splat(3), u16x16::splat(9));
-        let r = avx2::_mm256_avg_epu16(a, b);
+        let r = unsafe { avx2::_mm256_avg_epu16(a, b) };
        assert_eq!(r, u16x16::splat(6));
    }

@@ -1392,7 +1391,7 @@ fn _mm256_blendv_epi8() {
        let (a,b) = (i8x32::splat(4),i8x32::splat(2));
        let mask = i8x32::splat(0).replace(2,-1);
        let e = i8x32::splat(4).replace(2,2);
-        let r= avx2::_mm256_blendv_epi8(a,b,mask);
+        let r= unsafe { avx2::_mm256_blendv_epi8(a,b,mask) };
        assert_eq!(r,e);
    }

@@ -1404,7 +1403,7 @@ fn _mm256_cmpeq_epi8() {
        let b = i8x32::new(
            31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let r = avx2::_mm256_cmpeq_epi8(a, b);
+        let r = unsafe { avx2::_mm256_cmpeq_epi8(a, b) };
        assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8));
    }

@@ -1414,7 +1413,7 @@ fn _mm256_cmpeq_epi16() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i16x16::new(
            15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let r = avx2::_mm256_cmpeq_epi16(a, b);
+        let r = unsafe { avx2::_mm256_cmpeq_epi16(a, b) };
        assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16));
    }

@@ -1422,7 +1421,7 @@ fn _mm256_cmpeq_epi16() {
    fn _mm256_cmpeq_epi32() {
        let a = i32x8::new(0, 1, 2, 3,4,5,6,7);
        let b = i32x8::new(7,6,2,4,3, 2, 1, 0);
-        let r = avx2::_mm256_cmpeq_epi32(a, b);
+        let r = unsafe { avx2::_mm256_cmpeq_epi32(a, b) };
        assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
    }

@@ -1430,7 +1429,7 @@ fn _mm256_cmpeq_epi32() {
    fn _mm256_cmpeq_epi64() {
        let a = i64x4::new(0, 1, 2, 3);
        let b = i64x4::new(3, 2, 2, 0);
-        let r = avx2::_mm256_cmpeq_epi64(a, b);
+        let r = unsafe { avx2::_mm256_cmpeq_epi64(a, b) };
        assert_eq!(r, i64x4::splat(0).replace(
            2, 0xFFFFFFFFFFFFFFFFu64 as i64));
    }
@@ -1439,7 +1438,7 @@ fn _mm256_cmpeq_epi64() {
    fn _mm256_cmpgt_epi8() {
        let a = i8x32::splat(0).replace(0, 5);
        let b = i8x32::splat(0);
-        let r = avx2::_mm256_cmpgt_epi8(a, b);
+        let r = unsafe { avx2::_mm256_cmpgt_epi8(a, b) };
        assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8));
    }

@@ -1447,7 +1446,7 @@ fn _mm256_cmpgt_epi8() {
    fn _mm256_cmpgt_epi16() {
        let a = i16x16::splat(0).replace(0, 5);
        let b = i16x16::splat(0);
-        let r = avx2::_mm256_cmpgt_epi16(a, b);
+        let r = unsafe { avx2::_mm256_cmpgt_epi16(a, b) };
        assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16));
    }

@@ -1455,7 +1454,7 @@ fn _mm256_cmpgt_epi16() {
    fn _mm256_cmpgt_epi32() {
        let a = i32x8::splat(0).replace(0, 5);
        let b = i32x8::splat(0);
-        let r = avx2::_mm256_cmpgt_epi32(a, b);
+        let r = unsafe { avx2::_mm256_cmpgt_epi32(a, b) };
        assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

@@ -1463,7 +1462,7 @@ fn _mm256_cmpgt_epi32() {
    fn _mm256_cmpgt_epi64() {
        let a = i64x4::splat(0).replace(0, 5);
        let b = i64x4::splat(0);
-        let r = avx2::_mm256_cmpgt_epi64(a, b);
+        let r = unsafe { avx2::_mm256_cmpgt_epi64(a, b) };
        assert_eq!(r, i64x4::splat(0).replace(
            0, 0xFFFFFFFFFFFFFFFFu64 as i64));
    }
@@ -1472,7 +1471,7 @@ fn _mm256_cmpgt_epi64() {
    fn _mm256_hadd_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_hadd_epi16(a, b);
+        let r = unsafe { avx2::_mm256_hadd_epi16(a, b) };
        let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
        assert_eq!(r, e);
    }
@@ -1481,7 +1480,7 @@ fn _mm256_hadd_epi16() {
    fn _mm256_hadd_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_hadd_epi32(a, b);
+        let r = unsafe { avx2::_mm256_hadd_epi32(a, b) };
        let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8);
        assert_eq!(r, e);
    }
@@ -1490,7 +1489,7 @@ fn _mm256_hadd_epi32() {
    fn _mm256_hadds_epi16() {
        let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_hadds_epi16(a, b);
+        let r = unsafe { avx2::_mm256_hadds_epi16(a, b) };
        let e = i16x16::new(
            0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
        assert_eq!(r, e);
@@ -1500,7 +1499,7 @@ fn _mm256_hadds_epi16() {
    fn _mm256_hsub_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_hsub_epi16(a, b);
+        let r = unsafe { avx2::_mm256_hsub_epi16(a, b) };
        let e = i16x16::splat(0);
        assert_eq!(r, e);
    }
@@ -1509,7 +1508,7 @@ fn _mm256_hsub_epi16() {
    fn _mm256_hsub_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_hsub_epi32(a, b);
+        let r = unsafe { avx2::_mm256_hsub_epi32(a, b) };
        let e = i32x8::splat(0);
        assert_eq!(r, e);
    }
@@ -1518,7 +1517,7 @@ fn _mm256_hsub_epi32() {
    fn _mm256_hsubs_epi16() {
        let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_hsubs_epi16(a, b);
+        let r = unsafe { avx2::_mm256_hsubs_epi16(a, b) };
        let e = i16x16::splat(0).replace(0,0x7FFF);
        assert_eq!(r, e);
    }
@@ -1527,7 +1526,7 @@ fn _mm256_hsubs_epi16() {
    fn _mm256_madd_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_madd_epi16(a, b);
+        let r = unsafe { avx2::_mm256_madd_epi16(a, b) };
        let e = i32x8::splat(16);
        assert_eq!(r, e);
    }
@@ -1536,7 +1535,7 @@ fn _mm256_madd_epi16() {
    fn _mm256_maddubs_epi16() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
-        let r = avx2::_mm256_maddubs_epi16(a, b);
+        let r = unsafe { avx2::_mm256_maddubs_epi16(a, b) };
        let e = i16x16::splat(16);
        assert_eq!(r, e);
    }
@@ -1545,7 +1544,7 @@ fn _mm256_maddubs_epi16() {
    fn _mm256_max_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_max_epi16(a, b);
+        let r = unsafe { avx2::_mm256_max_epi16(a, b) };
        assert_eq!(r, b);
    }

@@ -1553,7 +1552,7 @@ fn _mm256_max_epi16() {
    fn _mm256_max_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_max_epi32(a, b);
+        let r = unsafe { avx2::_mm256_max_epi32(a, b) };
        assert_eq!(r, b);
    }

@@ -1561,7 +1560,7 @@ fn _mm256_max_epi32() {
    fn _mm256_max_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(4);
-        let r = avx2::_mm256_max_epi8(a, b);
+        let r = unsafe { avx2::_mm256_max_epi8(a, b) };
        assert_eq!(r, b);
    }

@@ -1569,7 +1568,7 @@ fn _mm256_max_epi8() {
    fn _mm256_max_epu16() {
        let a = u16x16::splat(2);
        let b = u16x16::splat(4);
-        let r = avx2::_mm256_max_epu16(a, b);
+        let r = unsafe { avx2::_mm256_max_epu16(a, b) };
        assert_eq!(r, b);
    }

@@ -1577,7 +1576,7 @@ fn _mm256_max_epu16() {
    fn _mm256_max_epu32() {
        let a = u32x8::splat(2);
        let b = u32x8::splat(4);
-        let r = avx2::_mm256_max_epu32(a, b);
+        let r = unsafe { avx2::_mm256_max_epu32(a, b) };
        assert_eq!(r, b);
    }

@@ -1585,7 +1584,7 @@ fn _mm256_max_epu32() {
    fn _mm256_max_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
-        let r = avx2::_mm256_max_epu8(a, b);
+        let r = unsafe { avx2::_mm256_max_epu8(a, b) };
        assert_eq!(r, b);
    }

@@ -1593,7 +1592,7 @@ fn _mm256_max_epu8() {
    fn _mm256_min_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_min_epi16(a, b);
+        let r = unsafe { avx2::_mm256_min_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1601,7 +1600,7 @@ fn _mm256_min_epi16() {
    fn _mm256_min_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_min_epi32(a, b);
+        let r = unsafe { avx2::_mm256_min_epi32(a, b) };
        assert_eq!(r, a);
    }

@@ -1609,7 +1608,7 @@ fn _mm256_min_epi32() {
    fn _mm256_min_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(4);
-        let r = avx2::_mm256_min_epi8(a, b);
+        let r = unsafe { avx2::_mm256_min_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -1617,7 +1616,7 @@ fn _mm256_min_epi8() {
    fn _mm256_min_epu16() {
        let a = u16x16::splat(2);
        let b = u16x16::splat(4);
-        let r = avx2::_mm256_min_epu16(a, b);
+        let r = unsafe { avx2::_mm256_min_epu16(a, b) };
        assert_eq!(r, a);
    }

@@ -1625,7 +1624,7 @@ fn _mm256_min_epu16() {
    fn _mm256_min_epu32() {
        let a = u32x8::splat(2);
        let b = u32x8::splat(4);
-        let r = avx2::_mm256_min_epu32(a, b);
+        let r = unsafe { avx2::_mm256_min_epu32(a, b) };
        assert_eq!(r, a);
    }

@@ -1633,7 +1632,7 @@ fn _mm256_min_epu32() {
    fn _mm256_min_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
-        let r = avx2::_mm256_min_epu8(a, b);
+        let r = unsafe { avx2::_mm256_min_epu8(a, b) };
        assert_eq!(r, a);
    }

@@ -1665,7 +1664,7 @@ fn _mm256_mpsadbw_epu8() {
    fn _mm256_mul_epi32() {
        let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let r = avx2::_mm256_mul_epi32(a, b);
+        let r = unsafe { avx2::_mm256_mul_epi32(a, b) };
        let e = i64x4::new(0, 0, 10, 14);
        assert_eq!(r, e);
    }
@@ -1674,7 +1673,7 @@ fn _mm256_mul_epi32() {
    fn _mm256_mul_epu32() {
        let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
        let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let r = avx2::_mm256_mul_epu32(a, b);
+        let r = unsafe { avx2::_mm256_mul_epu32(a, b) };
        let e = u64x4::new(0, 0, 10, 14);
        assert_eq!(r, e);
    }
@@ -1683,7 +1682,7 @@ fn _mm256_mul_epu32() {
    fn _mm256_mulhi_epi16() {
        let a = i16x16::splat(6535);
        let b = i16x16::splat(6535);
-        let r = avx2::_mm256_mulhi_epi16(a, b);
+        let r = unsafe { avx2::_mm256_mulhi_epi16(a, b) };
        let e = i16x16::splat(651);
        assert_eq!(r, e);
    }
@@ -1692,7 +1691,7 @@ fn _mm256_mulhi_epi16() {
    fn _mm256_mulhi_epu16() {
        let a = u16x16::splat(6535);
        let b = u16x16::splat(6535);
-        let r = avx2::_mm256_mulhi_epu16(a, b);
+        let r = unsafe { avx2::_mm256_mulhi_epu16(a, b) };
        let e = u16x16::splat(651);
        assert_eq!(r, e);
    }
@@ -1701,7 +1700,7 @@ fn _mm256_mulhi_epu16() {
    fn _mm256_mullo_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_mullo_epi16(a, b);
+        let r = unsafe { avx2::_mm256_mullo_epi16(a, b) };
        let e = i16x16::splat(8);
        assert_eq!(r, e);
    }
@@ -1710,7 +1709,7 @@ fn _mm256_mullo_epi16() {
    fn _mm256_mullo_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_mullo_epi32(a, b);
+        let r = unsafe { avx2::_mm256_mullo_epi32(a, b) };
        let e = i32x8::splat(8);
        assert_eq!(r, e);
    }
@@ -1719,7 +1718,7 @@ fn _mm256_mullo_epi32() {
    fn _mm256_mulhrs_epi16() {
        let a = i16x16::splat(2);
-        let b = i16x16::splat(4);
-        let r = avx2::_mm256_mullo_epi16(a, b);
-        let e = i16x16::splat(8);
+        let b = i16x16::splat(0x4000); // 0.5 in Q15 fixed point
+        let r = unsafe { avx2::_mm256_mulhrs_epi16(a, b) };
+        let e = i16x16::splat(1); // (((2 * 0x4000) >> 14) + 1) >> 1
        assert_eq!(r, e);
    }
@@ -1728,7 +1727,7 @@ fn _mm256_mulhrs_epi16() {
    fn _mm256_or_si256() {
        let a = __m256i::splat(-1);
        let b = __m256i::splat(0);
-        let r = avx2::_mm256_or_si256(a, b);
+        let r = unsafe { avx2::_mm256_or_si256(a, b) };
        assert_eq!(r, a);
    }

@@ -1736,7 +1735,7 @@ fn _mm256_or_si256() {
    fn _mm256_packs_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_packs_epi16(a, b);
+        let r = unsafe { avx2::_mm256_packs_epi16(a, b) };
        let e = i8x32::new(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
@@ -1750,7 +1749,7 @@ fn _mm256_packs_epi16() {
    fn _mm256_packs_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_packs_epi32(a, b);
+        let r = unsafe { avx2::_mm256_packs_epi32(a, b) };
        let e = i16x16::new(
            2, 2, 2, 2,
            4, 4, 4, 4,
@@ -1764,7 +1763,7 @@ fn _mm256_packs_epi32() {
    fn _mm256_packus_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
-        let r = avx2::_mm256_packus_epi16(a, b);
+        let r = unsafe { avx2::_mm256_packus_epi16(a, b) };
        let e = u8x32::new(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
@@ -1778,7 +1777,7 @@ fn _mm256_packus_epi16() {
    fn _mm256_packus_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
-        let r = avx2::_mm256_packus_epi32(a, b);
+        let r = unsafe { avx2::_mm256_packus_epi32(a, b) };
        let e = u16x16::new(
            2, 2, 2, 2,
            4, 4, 4, 4,
@@ -1792,7 +1791,7 @@ fn _mm256_packus_epi32() {
    fn _mm256_sad_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
-        let r = avx2::_mm256_sad_epu8(a, b);
+        let r = unsafe { avx2::_mm256_sad_epu8(a, b) };
        let e = u64x4::splat(16);
        assert_eq!(r, e);
    }
@@ -1801,7 +1800,7 @@ fn _mm256_sad_epu8() {
    fn _mm256_sign_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(-1);
-        let r = avx2::_mm256_sign_epi16(a, b);
+        let r = unsafe { avx2::_mm256_sign_epi16(a, b) };
        let e = i16x16::splat(-2);
        assert_eq!(r, e);
    }
@@ -1810,7 +1809,7 @@ fn _mm256_sign_epi16() {
    fn _mm256_sign_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(-1);
-        let r = avx2::_mm256_sign_epi32(a, b);
+        let r = unsafe { avx2::_mm256_sign_epi32(a, b) };
        let e = i32x8::splat(-2);
        assert_eq!(r, e);
    }
@@ -1819,53 +1818,53 @@ fn _mm256_sign_epi32() {
    fn _mm256_sign_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(-1);
-        let r = avx2::_mm256_sign_epi8(a, b);
+        let r = unsafe { avx2::_mm256_sign_epi8(a, b) };
        let e = i8x32::splat(-2);
        assert_eq!(r, e);
    }

    #[simd_test = "avx2"]
    fn _mm256_sll_epi16() {
-         assert_eq!(
-            avx2::_mm256_sll_epi16(i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)),
-            i16x16::splat(0xFF0));
-
+        let a = i16x16::splat(0xFF);
+        let b = i16x8::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_sll_epi16(a, b) };
+        assert_eq!(r, i16x16::splat(0xFF0));
    }

    #[simd_test = "avx2"]
    fn _mm256_sll_epi32() {
-         assert_eq!(
-            avx2::_mm256_sll_epi32(i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)),
-            i32x8::splat(0xFFFF0));
-
+        let a = i32x8::splat(0xFFFF);
+        let b = i32x4::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_sll_epi32(a, b) };
+        assert_eq!(r, i32x8::splat(0xFFFF0));
    }

    #[simd_test = "avx2"]
    fn _mm256_sll_epi64() {
-         assert_eq!(
-            avx2::_mm256_sll_epi64(i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)),
-            i64x4::splat(0xFFFFFFFF0));
-
+        let a = i64x4::splat(0xFFFFFFFF);
+        let b = i64x2::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_sll_epi64(a, b) };
+        assert_eq!(r, i64x4::splat(0xFFFFFFFF0));
    }

    #[simd_test = "avx2"]
    fn _mm256_slli_epi16() {
        assert_eq!(
-            avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
+            unsafe { avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4) },
            i16x16::splat(0xFF0));
    }

    #[simd_test = "avx2"]
    fn _mm256_slli_epi32() {
        assert_eq!(
-            avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4),
+            unsafe { avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4) },
            i32x8::splat(0xFFFF0));
    }

    #[simd_test = "avx2"]
    fn _mm256_slli_epi64() {
        assert_eq!(
-            avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4),
+            unsafe { avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4) },
            i64x4::splat(0xFFFFFFFF0));
    }

@@ -1873,7 +1872,7 @@ fn _mm256_slli_epi64() {
    fn _mm_sllv_epi32() {
        let a = i32x4::splat(2);
        let b = i32x4::splat(1);
-        let r = avx2::_mm_sllv_epi32(a, b);
+        let r = unsafe { avx2::_mm_sllv_epi32(a, b) };
        let e = i32x4::splat(4);
        assert_eq!(r, e);
    }
@@ -1882,7 +1881,7 @@ fn _mm_sllv_epi32() {
    fn _mm256_sllv_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(1);
-        let r = avx2::_mm256_sllv_epi32(a, b);
+        let r = unsafe { avx2::_mm256_sllv_epi32(a, b) };
        let e = i32x8::splat(4);
        assert_eq!(r, e);
    }
@@ -1891,7 +1890,7 @@ fn _mm256_sllv_epi32() {
    fn _mm_sllv_epi64() {
        let a = i64x2::splat(2);
        let b = i64x2::splat(1);
-        let r = avx2::_mm_sllv_epi64(a, b);
+        let r = unsafe { avx2::_mm_sllv_epi64(a, b) };
        let e = i64x2::splat(4);
        assert_eq!(r, e);
    }
@@ -1900,46 +1899,46 @@ fn _mm_sllv_epi64() {
    fn _mm256_sllv_epi64() {
        let a = i64x4::splat(2);
        let b = i64x4::splat(1);
-        let r = avx2::_mm256_sllv_epi64(a, b);
+        let r = unsafe { avx2::_mm256_sllv_epi64(a, b) };
        let e = i64x4::splat(4);
        assert_eq!(r, e);
    }

    #[simd_test = "avx2"]
    fn _mm256_sra_epi16() {
-         assert_eq!(
-            avx2::_mm256_sra_epi16(
-                i16x16::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)),
-            i16x16::splat(-1));
+        let a = i16x16::splat(-1);
+        let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+        let r = unsafe { avx2::_mm256_sra_epi16(a, b) };
+        assert_eq!(r, i16x16::splat(-1));
    }

    #[simd_test = "avx2"]
    fn _mm256_sra_epi32() {
-         assert_eq!(
-            avx2::_mm256_sra_epi32(
-                i32x8::splat(-1), i32x4::splat(0).replace(0,1)),
-            i32x8::splat(-1));
+        let a = i32x8::splat(-1);
+        let b = i32x4::splat(0).replace(0, 1);
+        let r = unsafe { avx2::_mm256_sra_epi32(a, b) };
+        assert_eq!(r, i32x8::splat(-1));
    }

    #[simd_test = "avx2"]
    fn _mm256_srai_epi16() {
-           assert_eq!(
-            avx2::_mm256_srai_epi16(
-                i16x16::splat(-1), 1), i16x16::splat(-1));
+        assert_eq!(
+            unsafe { avx2::_mm256_srai_epi16(i16x16::splat(-1), 1) },
+            i16x16::splat(-1));
    }

    #[simd_test = "avx2"]
    fn _mm256_srai_epi32() {
-           assert_eq!(
-            avx2::_mm256_srai_epi32(
-                i32x8::splat(-1), 1), i32x8::splat(-1));
+        assert_eq!(
+            unsafe { avx2::_mm256_srai_epi32(i32x8::splat(-1), 1) },
+            i32x8::splat(-1));
    }

    #[simd_test = "avx2"]
    fn _mm_srav_epi32() {
        let a = i32x4::splat(4);
        let count = i32x4::splat(1);
-        let r = avx2::_mm_srav_epi32(a, count);
+        let r = unsafe { avx2::_mm_srav_epi32(a, count) };
        let e = i32x4::splat(2);
        assert_eq!(r, e );
    }
@@ -1948,53 +1947,53 @@ fn _mm_srav_epi32() {
    fn _mm256_srav_epi32() {
        let a = i32x8::splat(4);
        let count = i32x8::splat(1);
-        let r = avx2::_mm256_srav_epi32(a, count);
+        let r = unsafe { avx2::_mm256_srav_epi32(a, count) };
        let e = i32x8::splat(2);
        assert_eq!(r, e );
    }

    #[simd_test = "avx2"]
    fn _mm256_srl_epi16() {
-        assert_eq!(
-            avx2::_mm256_srl_epi16(
-                i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)),
-            i16x16::splat(0xF));
+        let a = i16x16::splat(0xFF);
+        let b = i16x8::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_srl_epi16(a, b) };
+        assert_eq!(r, i16x16::splat(0xF));
    }

    #[simd_test = "avx2"]
    fn _mm256_srl_epi32() {
-        assert_eq!(
-            avx2::_mm256_srl_epi32(
-                i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)),
-            i32x8::splat(0xFFF));
+        let a = i32x8::splat(0xFFFF);
+        let b = i32x4::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_srl_epi32(a, b) };
+        assert_eq!(r, i32x8::splat(0xFFF));
    }

    #[simd_test = "avx2"]
    fn _mm256_srl_epi64() {
-        assert_eq!(
-            avx2::_mm256_srl_epi64(
-                i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)),
-            i64x4::splat(0xFFFFFFF));
+        let a = i64x4::splat(0xFFFFFFFF);
+        let b = i64x2::splat(0).replace(0, 4);
+        let r = unsafe { avx2::_mm256_srl_epi64(a, b) };
+        assert_eq!(r, i64x4::splat(0xFFFFFFF));
    }

    #[simd_test = "avx2"]
    fn _mm256_srli_epi16() {
        assert_eq!(
-            avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4),
+            unsafe { avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4) },
            i16x16::splat(0xF));
    }

    #[simd_test = "avx2"]
    fn _mm256_srli_epi32() {
        assert_eq!(
-            avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4),
+            unsafe { avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4) },
            i32x8::splat(0xFFF));
    }

    #[simd_test = "avx2"]
    fn _mm256_srli_epi64() {
        assert_eq!(
-            avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4),
+            unsafe { avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4) },
            i64x4::splat(0xFFFFFFF));
    }

@@ -2002,7 +2001,7 @@ fn _mm256_srli_epi64() {
    fn _mm_srlv_epi32() {
        let a = i32x4::splat(2);
        let count = i32x4::splat(1);
-        let r = avx2::_mm_srlv_epi32(a, count);
+        let r = unsafe { avx2::_mm_srlv_epi32(a, count) };
        let e = i32x4::splat(1);
        assert_eq!(r, e);
    }
@@ -2011,7 +2010,7 @@ fn _mm_srlv_epi32() {
    fn _mm256_srlv_epi32() {
        let a = i32x8::splat(2);
        let count = i32x8::splat(1);
-        let r = avx2::_mm256_srlv_epi32(a, count);
+        let r = unsafe { avx2::_mm256_srlv_epi32(a, count) };
        let e = i32x8::splat(1);
        assert_eq!(r, e);
    }
@@ -2020,7 +2019,7 @@ fn _mm256_srlv_epi32() {
    fn _mm_srlv_epi64() {
        let a = i64x2::splat(2);
        let count = i64x2::splat(1);
-        let r = avx2::_mm_srlv_epi64(a, count);
+        let r = unsafe { avx2::_mm_srlv_epi64(a, count) };
        let e = i64x2::splat(1);
        assert_eq!(r, e);
    }
@@ -2030,7 +2029,7 @@ fn _mm_srlv_epi64() {
    fn _mm256_srlv_epi64() {
        let a = i64x4::splat(2);
        let count = i64x4::splat(1);
-        let r = avx2::_mm256_srlv_epi64(a, count);
+        let r = unsafe { avx2::_mm256_srlv_epi64(a, count) };
        let e = i64x4::splat(1);
        assert_eq!(r, e);
    }
@@ -2039,7 +2038,7 @@ fn _mm256_srlv_epi64() {
    fn _mm256_sub_epi16() {
        let a = i16x16::splat(4);
        let b = i16x16::splat(2);
-        let r = avx2::_mm256_sub_epi16(a, b);
+        let r = unsafe { avx2::_mm256_sub_epi16(a, b) };
        assert_eq!(r, b);
    }

@@ -2047,7 +2046,7 @@ fn _mm256_sub_epi16() {
    fn _mm256_sub_epi32() {
        let a = i32x8::splat(4);
        let b = i32x8::splat(2);
-        let r = avx2::_mm256_sub_epi32(a, b);
+        let r = unsafe { avx2::_mm256_sub_epi32(a, b) };
        assert_eq!(r, b);
    }

@@ -2055,7 +2054,7 @@ fn _mm256_sub_epi32() {
    fn _mm256_sub_epi64() {
        let a = i64x4::splat(4);
        let b = i64x4::splat(2);
-        let r = avx2::_mm256_sub_epi64(a, b);
+        let r = unsafe { avx2::_mm256_sub_epi64(a, b) };
        assert_eq!(r, b);
    }

@@ -2063,7 +2062,7 @@ fn _mm256_sub_epi64() {
    fn _mm256_sub_epi8() {
        let a = i8x32::splat(4);
        let b = i8x32::splat(2);
-        let r = avx2::_mm256_sub_epi8(a, b);
+        let r = unsafe { avx2::_mm256_sub_epi8(a, b) };
        assert_eq!(r, b);
    }

@@ -2071,7 +2070,7 @@ fn _mm256_sub_epi8() {
    fn _mm256_subs_epi16() {
        let a = i16x16::splat(4);
        let b = i16x16::splat(2);
-        let r = avx2::_mm256_subs_epi16(a, b);
+        let r = unsafe { avx2::_mm256_subs_epi16(a, b) };
        assert_eq!(r, b);
    }

@@ -2079,7 +2078,7 @@ fn _mm256_subs_epi16() {
    fn _mm256_subs_epi8() {
        let a = i8x32::splat(4);
        let b = i8x32::splat(2);
-        let r = avx2::_mm256_subs_epi8(a, b);
+        let r = unsafe { avx2::_mm256_subs_epi8(a, b) };
        assert_eq!(r, b);
    }

@@ -2087,7 +2086,7 @@ fn _mm256_subs_epi8() {
    fn _mm256_subs_epu16() {
        let a = u16x16::splat(4);
        let b = u16x16::splat(2);
-        let r = avx2::_mm256_subs_epu16(a, b);
+        let r = unsafe { avx2::_mm256_subs_epu16(a, b) };
        assert_eq!(r, b);
    }

@@ -2095,14 +2094,15 @@ fn _mm256_subs_epu16() {
    fn _mm256_subs_epu8() {
        let a = u8x32::splat(4);
        let b = u8x32::splat(2);
-        let r = avx2::_mm256_subs_epu8(a, b);
+        let r = unsafe { avx2::_mm256_subs_epu8(a, b) };
        assert_eq!(r, b);
    }

    #[simd_test = "avx2"]
    fn _mm256_xor_si256() {
-        assert_eq!(
-            avx2::_mm256_xor_si256(__m256i::splat(5), __m256i::splat(3)),
-            __m256i::splat(6));
+        let a = __m256i::splat(5);
+        let b = __m256i::splat(3);
+        let r = unsafe { avx2::_mm256_xor_si256(a, b) };
+        assert_eq!(r, __m256i::splat(6));
    }
 }
@@ -10,20 +10,12 @@
 #[cfg(test)]
 use stdsimd_test::assert_instr;

-#[allow(dead_code)]
-extern "C" {
-    #[link_name="llvm.x86.bmi.bextr.32"]
-    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
-    #[link_name="llvm.x86.bmi.bextr.64"]
-    fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
-}
-
 /// Extracts bits in range [`start`, `start` + `length`) from `a` into
 /// the least significant bits of the result.
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
-pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
    _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
 }

@@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
-pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
+pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
    _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
 }

@@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
-pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
-    unsafe { x86_bmi_bextr_32(a, control) }
+pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
+    x86_bmi_bextr_32(a, control)
 }

 /// Extracts bits of `a` specified by `control` into
@@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
-pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
-    unsafe { x86_bmi_bextr_64(a, control) }
+pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
+    x86_bmi_bextr_64(a, control)
 }

 /// Bitwise logical `AND` of inverted `a` with `b`.
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(andn))]
-pub fn _andn_u32(a: u32, b: u32) -> u32 {
+pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
    !a & b
 }

@@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(andn))]
-pub fn _andn_u64(a: u64, b: u64) -> u64 {
+pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
    !a & b
 }

@@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsi))]
-pub fn _blsi_u32(x: u32) -> u32 {
+pub unsafe fn _blsi_u32(x: u32) -> u32 {
    x & x.wrapping_neg()
 }

@@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsi))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsi_u64(x: u64) -> u64 {
+pub unsafe fn _blsi_u64(x: u64) -> u64 {
    x & x.wrapping_neg()
 }

@@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsmsk))]
-pub fn _blsmsk_u32(x: u32) -> u32 {
+pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_sub(1u32))
 }

@@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsmsk_u64(x: u64) -> u64 {
+pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_sub(1u64))
 }

@@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsr))]
-pub fn _blsr_u32(x: u32) -> u32 {
+pub unsafe fn _blsr_u32(x: u32) -> u32 {
    x & (x.wrapping_sub(1))
 }

@@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsr))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsr_u64(x: u64) -> u64 {
+pub unsafe fn _blsr_u64(x: u64) -> u64 {
    x & (x.wrapping_sub(1))
 }

@@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u16(x: u16) -> u16 {
+pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
    x.trailing_zeros() as u16
 }

@@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u32(x: u32) -> u32 {
+pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
    x.trailing_zeros()
 }

@@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u64(x: u64) -> u64 {
+pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }

@@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _mm_tzcnt_u32(x: u32) -> u32 {
+pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
    x.trailing_zeros()
 }

@@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _mm_tzcnt_u64(x: u64) -> u64 {
+pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }

+#[allow(dead_code)]
+extern "C" {
+    #[link_name="llvm.x86.bmi.bextr.32"]
+    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
+    #[link_name="llvm.x86.bmi.bextr.64"]
+    fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
+}
+
 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;
@@ -191,98 +191,122 @@ mod tests {

    #[simd_test = "bmi"]
    fn _bextr_u32() {
-        assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
+        let r = unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) };
+        assert_eq!(r, 0b0000_0101u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _bextr_u64() {
-        assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
+        let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) };
+        assert_eq!(r, 0b0000_0101u64);
    }

    #[simd_test = "bmi"]
    fn _andn_u32() {
-        assert_eq!(bmi::_andn_u32(0, 0), 0);
-        assert_eq!(bmi::_andn_u32(0, 1), 1);
-        assert_eq!(bmi::_andn_u32(1, 0), 0);
-        assert_eq!(bmi::_andn_u32(1, 1), 0);
+        assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1);
+        assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0);

-        assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
-        assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
+        let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) };
+        assert_eq!(r, 0b1111_1111u32);
+
+        let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) };
+        assert_eq!(r, 0b0001_1101u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _andn_u64() {
-        assert_eq!(bmi::_andn_u64(0, 0), 0);
-        assert_eq!(bmi::_andn_u64(0, 1), 1);
-        assert_eq!(bmi::_andn_u64(1, 0), 0);
-        assert_eq!(bmi::_andn_u64(1, 1), 0);
+        assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1);
+        assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0);

-        assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
-        assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
+        let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) };
+        assert_eq!(r, 0b1111_1111u64);
+
+        let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) };
+        assert_eq!(r, 0b0001_1101u64);
    }

    #[simd_test = "bmi"]
    fn _blsi_u32() {
-        assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
+        assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsi_u64() {
-        assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
+        assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64);
    }

    #[simd_test = "bmi"]
    fn _blsmsk_u32() {
-        assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
+        let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) };
+        assert_eq!(r, 0b0001_1111u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsmsk_u64() {
-        assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
+        let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) };
+        assert_eq!(r, 0b0001_1111u64);
    }

    #[simd_test = "bmi"]
    fn _blsr_u32() {
-        /// TODO: test the behavior when the input is 0
-        assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
+        // TODO: test the behavior when the input is 0
+        let r = unsafe { bmi::_blsr_u32(0b0011_0000u32) };
+        assert_eq!(r, 0b0010_0000u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsr_u64() {
-        /// TODO: test the behavior when the input is 0
-        assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
+        // TODO: test the behavior when the input is 0
+        let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) };
+        assert_eq!(r, 0b0010_0000u64);
    }

    #[simd_test = "bmi"]
    fn _tzcnt_u16() {
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
-        assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16);
    }

    #[simd_test = "bmi"]
    fn _tzcnt_u32() {
-        assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
-        assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
-        assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _tzcnt_u64() {
-        assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
-        assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
-        assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64);
    }
 }
@@ -19,7 +19,7 @@
 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
 #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
 #[target_feature = "+bmi2"]
-pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
+pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
    let result: u64 = (a as u64) * (b as u64);
    let hi = (result >> 32) as u32;
    (result as u32, hi)
@@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
 #[cfg_attr(test, assert_instr(mulx))]
 #[target_feature = "+bmi2"]
 #[cfg(not(target_arch = "x86"))] // calls an intrinsic
-pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
+pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
    let result: u128 = (a as u128) * (b as u128);
    let hi = (result >> 64) as u64;
    (result as u64, hi)
 }

+/// Zero higher bits of `a` >= `index`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(bzhi))]
+pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
+    x86_bmi2_bzhi_32(a, index)
+}
+
+/// Zero higher bits of `a` >= `index`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
+    x86_bmi2_bzhi_64(a, index)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pdep))]
+pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
+    x86_bmi2_pdep_32(a, mask)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pdep))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pdep_64(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pext))]
+pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
+    x86_bmi2_pext_32(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pext))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pext_64(a, mask)
+}
+
 #[allow(dead_code)]
 extern "C" {
    #[link_name="llvm.x86.bmi.bzhi.32"]
@@ -55,63 +110,6 @@ pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
    fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
 }

-
-/// Zero higher bits of `a` >= `index`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(bzhi))]
-pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
-    unsafe { x86_bmi2_bzhi_32(a, index) }
-}
-
-/// Zero higher bits of `a` >= `index`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(bzhi))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
-    unsafe { x86_bmi2_bzhi_64(a, index) }
-}
-
-
-/// Scatter contiguous low order bits of `a` to the result at the positions
-/// specified by the `mask`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pdep))]
-pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
-    unsafe { x86_bmi2_pdep_32(a, mask) }
-}
-
-/// Scatter contiguous low order bits of `a` to the result at the positions
-/// specified by the `mask`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pdep))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
-    unsafe { x86_bmi2_pdep_64(a, mask) }
-}
-
-/// Gathers the bits of `x` specified by the `mask` into the contiguous low
-/// order bit positions of the result.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pext))]
-pub fn _pext_u32(a: u32, mask: u32) -> u32 {
-    unsafe { x86_bmi2_pext_32(a, mask) }
-}
-
-/// Gathers the bits of `x` specified by the `mask` into the contiguous low
-/// order bit positions of the result.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pext))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _pext_u64(a: u64, mask: u64) -> u64 {
-    unsafe { x86_bmi2_pext_64(a, mask) }
-}
-
 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;
@@ -128,8 +126,8 @@ fn _pext_u32() {
        let m1 = 0b1110_1011_1110_1111u32;
        let s1 = 0b0001_0111_0100_0011u32;

-        assert_eq!(bmi2::_pext_u32(n, m0), s0);
-        assert_eq!(bmi2::_pext_u32(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@@ -143,8 +141,8 @@ fn _pext_u64() {
        let m1 = 0b1110_1011_1110_1111u64;
        let s1 = 0b0001_0111_0100_0011u64;

-        assert_eq!(bmi2::_pext_u64(n, m0), s0);
-        assert_eq!(bmi2::_pext_u64(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@@ -157,8 +155,8 @@ fn _pdep_u32() {
        let m1 = 0b1110_1011_1110_1111u32;
        let s1 = 0b1110_1001_0010_0011u32;

-        assert_eq!(bmi2::_pdep_u32(n, m0), s0);
-        assert_eq!(bmi2::_pdep_u32(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@@ -172,15 +170,15 @@ fn _pdep_u64() {
        let m1 = 0b1110_1011_1110_1111u64;
        let s1 = 0b1110_1001_0010_0011u64;

-        assert_eq!(bmi2::_pdep_u64(n, m0), s0);
-        assert_eq!(bmi2::_pdep_u64(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
    fn _bzhi_u32() {
        let n = 0b1111_0010u32;
        let s = 0b0001_0010u32;
-        assert_eq!(bmi2::_bzhi_u32(n, 5), s);
+        assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s);
    }

    #[simd_test = "bmi2"]
@@ -188,14 +186,14 @@ fn _bzhi_u32() {
    fn _bzhi_u64() {
        let n = 0b1111_0010u64;
        let s = 0b0001_0010u64;
-        assert_eq!(bmi2::_bzhi_u64(n, 5), s);
+        assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s);
    }

    #[simd_test = "bmi2"]
    fn _mulx_u32() {
        let a: u32 = 4_294_967_200;
        let b: u32 = 2;
-        let (lo, hi): (u32, u32)  = bmi2::_mulx_u32(a, b);
+        let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) };
        // result = 8589934400
        //        = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
        //            ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -208,7 +206,7 @@ fn _mulx_u32() {
    fn _mulx_u64() {
        let a: u64 = 9_223_372_036_854_775_800;
        let b: u64 = 100;
-        let (lo, hi): (u64, u64)  = bmi2::_mulx_u64(a, b);
+        let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) };
        // result = 922337203685477580000
        //        = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
        //            ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -9,15 +9,15 @@
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(addss))]
-pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { addss(a, b) }
+pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
+    addss(a, b)
 }

 /// Adds f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(addps))]
-pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
    a + b
 }

@@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(subss))]
-pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { subss(a, b) }
+pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
+    subss(a, b)
 }

 /// Subtracts f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(subps))]
-pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
    a - b
 }

@@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(mulss))]
-pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { mulss(a, b) }
+pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
+    mulss(a, b)
 }

 /// Multiplies f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(mulps))]
-pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
    a * b
 }

@@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(divss))]
-pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { divss(a, b) }
+pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
+    divss(a, b)
 }

 /// Divides f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(divps))]
-pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
    a / b
 }

@@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(sqrtss))]
-pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
-    unsafe { sqrtss(a) }
+pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
+    sqrtss(a)
 }

 /// Return the square root of packed single-precision (32-bit) floating-point
@@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(sqrtps))]
-pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
-    unsafe { sqrtps(a) }
+pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
+    sqrtps(a)
 }

 /// Return the approximate reciprocal of the first single-precision
@@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rcpss))]
-pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
-    unsafe { rcpss(a) }
+pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 {
+    rcpss(a)
 }

 /// Return the approximate reciprocal of packed single-precision (32-bit)
@@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rcpps))]
-pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
-    unsafe { rcpps(a) }
+pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 {
+    rcpps(a)
 }

 /// Return the approximate reciprocal square root of the fist single-precision
@@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rsqrtss))]
-pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
-    unsafe { rsqrtss(a) }
+pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
+    rsqrtss(a)
 }

 /// Return the approximate reciprocal square root of packed single-precision
@@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rsqrtps))]
-pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
-    unsafe { rsqrtps(a) }
+pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
+    rsqrtps(a)
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
@@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(minss))]
-pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { minss(a, b) }
+pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
+    minss(a, b)
 }

 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
@@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(minps))]
-pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { minps(a, b) }
+pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
+    minps(a, b)
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
@@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maxss))]
-pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { maxss(a, b) }
+pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
+    maxss(a, b)
 }

 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
@@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maxps))]
-pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { maxps(a, b) }
+pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
+    maxps(a, b)
 }

-// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
-// using `mask`.
-// The lower half of result takes values from `a` and the higher half from `b`.
-// Mask is split to 2 control bits each to index the element from inputs.
+/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
+/// `b` using `mask`.
+///
+/// The lower half of result takes values from `a` and the higher half from
+/// `b`. Mask is split to 2 control bits each to index the element from inputs.
 #[inline(always)]
 #[target_feature = "+sse"]
-pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
+pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
    let mask = (mask & 0xFF) as u8;

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
-            unsafe {
-                simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
-            }
+            simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
        }
    }
    macro_rules! shuffle_x67 {
@@ -219,10 +218,10 @@ macro_rules! shuffle_x23 {
 }

 #[cfg(test)]
-#[cfg_attr(test, assert_instr(shufps))]
 #[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(shufps))]
 fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
-    _mm_shuffle_ps(a, b, 3)
+    unsafe { _mm_shuffle_ps(a, b, 3) }
 }

 /// Unpack and interleave single-precision (32-bit) floating-point elements
@@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpckhps))]
-pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
+pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [2, 6, 3, 7])
 }

 /// Unpack and interleave single-precision (32-bit) floating-point elements
@@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpcklps))]
-pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
+pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [0, 4, 1, 5])
 }

 /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the lower
@@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[target_feature = "+sse"]
 #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
 #[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
-pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
    // TODO; figure why this is a different instruction on Windows?
-    unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
+    simd_shuffle4(a, b, [6, 7, 2, 3])
 }

 /// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
@@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpcklpd))]
-pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
+pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [0, 1, 4, 5])
 }

 /// Return a mask of the most significant bit of each element in `a`.
@@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(movmskps))]
-pub fn _mm_movemask_ps(a: f32x4) -> i32 {
-    unsafe { movmskps(a) }
+pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
+    movmskps(a)
 }

 #[allow(improper_ctypes)]
@@ -318,7 +317,7 @@ mod tests {
    fn _mm_add_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_add_ps(a, b);
+        let r = unsafe { sse::_mm_add_ps(a, b) };
        assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
    }

@@ -326,7 +325,7 @@ fn _mm_add_ps() {
    fn _mm_add_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_add_ss(a, b);
+        let r = unsafe { sse::_mm_add_ss(a, b) };
        assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
    }

@@ -334,7 +333,7 @@ fn _mm_add_ss() {
    fn _mm_sub_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_sub_ps(a, b);
+        let r = unsafe { sse::_mm_sub_ps(a, b) };
        assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
    }

@@ -342,7 +341,7 @@ fn _mm_sub_ps() {
    fn _mm_sub_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_sub_ss(a, b);
+        let r = unsafe { sse::_mm_sub_ss(a, b) };
        assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
    }

@@ -350,7 +349,7 @@ fn _mm_sub_ss() {
    fn _mm_mul_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_mul_ps(a, b);
+        let r = unsafe { sse::_mm_mul_ps(a, b) };
        assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
    }

@@ -358,7 +357,7 @@ fn _mm_mul_ps() {
    fn _mm_mul_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_mul_ss(a, b);
+        let r = unsafe { sse::_mm_mul_ss(a, b) };
        assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
    }

@@ -366,7 +365,7 @@ fn _mm_mul_ss() {
    fn _mm_div_ps() {
        let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
-        let r = sse::_mm_div_ps(a, b);
+        let r = unsafe { sse::_mm_div_ps(a, b) };
        assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
    }

@@ -374,14 +373,14 @@ fn _mm_div_ps() {
    fn _mm_div_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_div_ss(a, b);
+        let r = unsafe { sse::_mm_div_ss(a, b) };
        assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
    }

    #[simd_test = "sse"]
    fn _mm_sqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_sqrt_ss(a);
+        let r = unsafe { sse::_mm_sqrt_ss(a) };
        let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@@ -389,7 +388,7 @@ fn _mm_sqrt_ss() {
    #[simd_test = "sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_sqrt_ps(a);
+        let r = unsafe { sse::_mm_sqrt_ps(a) };
        let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
        assert_eq!(r, e);
    }
@@ -397,7 +396,7 @@ fn _mm_sqrt_ps() {
    #[simd_test = "sse"]
    fn _mm_rcp_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rcp_ss(a);
+        let r = unsafe { sse::_mm_rcp_ss(a) };
        let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@@ -405,7 +404,7 @@ fn _mm_rcp_ss() {
    #[simd_test = "sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rcp_ps(a);
+        let r = unsafe { sse::_mm_rcp_ps(a) };
        let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        assert_eq!(r, e);
    }
@@ -413,7 +412,7 @@ fn _mm_rcp_ps() {
    #[simd_test = "sse"]
    fn _mm_rsqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rsqrt_ss(a);
+        let r = unsafe { sse::_mm_rsqrt_ss(a) };
        let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@@ -421,7 +420,7 @@ fn _mm_rsqrt_ss() {
    #[simd_test = "sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rsqrt_ps(a);
+        let r = unsafe { sse::_mm_rsqrt_ps(a) };
        let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        assert_eq!(r, e);
    }
@@ -430,7 +429,7 @@ fn _mm_rsqrt_ps() {
    fn _mm_min_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_min_ss(a, b);
+        let r = unsafe { sse::_mm_min_ss(a, b) };
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

@@ -438,7 +437,7 @@ fn _mm_min_ss() {
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_min_ps(a, b);
+        let r = unsafe { sse::_mm_min_ps(a, b) };
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

@@ -446,7 +445,7 @@ fn _mm_min_ps() {
    fn _mm_max_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_max_ss(a, b);
+        let r = unsafe { sse::_mm_max_ss(a, b) };
        assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
    }

@@ -454,7 +453,7 @@ fn _mm_max_ss() {
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_max_ps(a, b);
+        let r = unsafe { sse::_mm_max_ps(a, b) };
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

@@ -463,7 +462,7 @@ fn _mm_shuffle_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
        let mask = 0b00_01_01_11;
-        let r = sse::_mm_shuffle_ps(a, b, mask);
+        let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) };
        assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
    }

@@ -471,7 +470,7 @@ fn _mm_shuffle_ps() {
    fn _mm_unpackhi_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_unpackhi_ps(a, b);
+        let r = unsafe { sse::_mm_unpackhi_ps(a, b) };
        assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
    }

@@ -479,7 +478,7 @@ fn _mm_unpackhi_ps() {
    fn _mm_unpacklo_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_unpacklo_ps(a, b);
+        let r = unsafe { sse::_mm_unpacklo_ps(a, b) };
        assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
    }

@@ -487,7 +486,7 @@ fn _mm_unpacklo_ps() {
    fn _mm_movehl_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_movehl_ps(a, b);
+        let r = unsafe { sse::_mm_movehl_ps(a, b) };
        assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
    }

@@ -495,16 +494,20 @@ fn _mm_movehl_ps() {
    fn _mm_movelh_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_movelh_ps(a, b);
+        let r = unsafe { sse::_mm_movelh_ps(a, b) };
        assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
    }

    #[simd_test = "sse"]
    fn _mm_movemask_ps() {
-        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
+        let r = unsafe {
+            sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0))
+        };
        assert_eq!(r, 0b0101);

-        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
+        let r = unsafe {
+            sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0))
+        };
        assert_eq!(r, 0b0111);
    }
 }
@@ -1,3 +1,6 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 use std::mem;
 use std::os::raw::c_void;
 use std::ptr;
@@ -9,23 +12,22 @@
 use v128::*;
 use v64::*;

-#[cfg(test)]
-use stdsimd_test::assert_instr;
-
 /// Provide a hint to the processor that the code sequence is a spin-wait loop.
 ///
 /// This can help improve the performance and power consumption of spin-wait
 /// loops.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_pause() {
-    unsafe { pause() }
+#[cfg_attr(test, assert_instr(pause))]
+pub unsafe fn _mm_pause() {
+    pause()
 }

 /// Invalidate and flush the cache line that contains `p` from all levels of
 /// the cache hierarchy.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(clflush))]
 pub unsafe fn _mm_clflush(p: *mut c_void) {
    clflush(p)
 }
@@ -38,8 +40,9 @@ pub unsafe fn _mm_clflush(p: *mut c_void) {
 /// program order.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_lfence() {
-    unsafe { lfence() }
+#[cfg_attr(test, assert_instr(lfence))]
+pub unsafe fn _mm_lfence() {
+    lfence()
 }

 /// Perform a serializing operation on all load-from-memory and store-to-memory
@@ -50,79 +53,89 @@ pub fn _mm_lfence() {
 /// which follows the fence in program order.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mfence() {
-    unsafe { mfence() }
+#[cfg_attr(test, assert_instr(mfence))]
+pub unsafe fn _mm_mfence() {
+    mfence()
 }

 /// Add packed 8-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 {
+#[cfg_attr(test, assert_instr(paddb))]
+pub unsafe fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 {
    a + b
 }

 /// Add packed 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(paddw))]
+pub unsafe fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a + b
 }

 /// Add packed 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 {
+#[cfg_attr(test, assert_instr(paddd))]
+pub unsafe fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 {
    a + b
 }

 /// Add packed 64-bit integers in `a` and "b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 {
+#[cfg_attr(test, assert_instr(paddq))]
+pub unsafe fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 {
    a + b
 }

 /// Add packed 8-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    unsafe { paddsb(a, b) }
+#[cfg_attr(test, assert_instr(paddsb))]
+pub unsafe fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    paddsb(a, b)
 }

 /// Add packed 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(paddsw))]
-pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { paddsw(a, b) }
+pub unsafe fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    paddsw(a, b)
 }

 /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { paddsub(a, b) }
+#[cfg_attr(test, assert_instr(paddusb))]
+pub unsafe fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 {
+    paddsub(a, b)
 }

 /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    unsafe { paddsuw(a, b) }
+#[cfg_attr(test, assert_instr(paddusw))]
+pub unsafe fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    paddsuw(a, b)
 }

 /// Average packed unsigned 8-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { pavgb(a, b) }
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 {
+    pavgb(a, b)
 }

 /// Average packed unsigned 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    unsafe { pavgw(a, b) }
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    pavgw(a, b)
 }

 /// Multiply and then horizontally add signed 16 bit integers in `a` and `b`.
@@ -132,40 +145,45 @@ pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 {
 /// intermediate 32-bit integers.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 {
-    unsafe { pmaddwd(a, b) }
+#[cfg_attr(test, assert_instr(pmaddwd))]
+pub unsafe fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 {
+    pmaddwd(a, b)
 }

 /// Compare packed 16-bit integers in `a` and `b`, and return the packed
 /// maximum values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { pmaxsw(a, b) }
+#[cfg_attr(test, assert_instr(pmaxsw))]
+pub unsafe fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    pmaxsw(a, b)
 }

 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
 /// packed maximum values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { pmaxub(a, b) }
+#[cfg_attr(test, assert_instr(pmaxub))]
+pub unsafe fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 {
+    pmaxub(a, b)
 }

 /// Compare packed 16-bit integers in `a` and `b`, and return the packed
 /// minimum values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { pminsw(a, b) }
+#[cfg_attr(test, assert_instr(pminsw))]
+pub unsafe fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    pminsw(a, b)
 }

 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
 /// packed minimum values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { pminub(a, b) }
+#[cfg_attr(test, assert_instr(pminub))]
+pub unsafe fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
+    pminub(a, b)
 }

 /// Multiply the packed 16-bit integers in `a` and `b`.
@@ -174,8 +192,9 @@ pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 {
 /// high 16 bits of the intermediate integers.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { pmulhw(a, b) }
+#[cfg_attr(test, assert_instr(pmulhw))]
+pub unsafe fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    pmulhw(a, b)
 }

 /// Multiply the packed unsigned 16-bit integers in `a` and `b`.
@@ -184,8 +203,9 @@ pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
 /// high 16 bits of the intermediate integers.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    unsafe { pmulhuw(a, b) }
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    pmulhuw(a, b)
 }

 /// Multiply the packed 16-bit integers in `a` and `b`.
@@ -194,7 +214,8 @@ pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 {
 /// low 16 bits of the intermediate integers.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(pmullw))]
+pub unsafe fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a * b
 }

@@ -204,8 +225,9 @@ pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 {
 /// Return the unsigned 64-bit results.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
-    unsafe { pmuludq(a, b) }
+#[cfg_attr(test, assert_instr(pmuludq))]
+pub unsafe fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
+    pmuludq(a, b)
 }

 /// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -216,35 +238,40 @@ pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 {
 /// the low 16 bits of 64-bit elements returned.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 {
-    unsafe { psadbw(a, b) }
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 {
+    psadbw(a, b)
 }

 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 {
+#[cfg_attr(test, assert_instr(psubb))]
+pub unsafe fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 {
    a - b
 }

 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(psubw))]
+pub unsafe fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a - b
 }

 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 {
+#[cfg_attr(test, assert_instr(psubd))]
+pub unsafe fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 {
    a - b
 }

 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
+#[cfg_attr(test, assert_instr(psubq))]
+pub unsafe fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
    a - b
 }

@@ -252,54 +279,56 @@ pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 {
 /// using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    unsafe { psubsb(a, b) }
+#[cfg_attr(test, assert_instr(psubsb))]
+pub unsafe fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    psubsb(a, b)
 }

 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
 /// using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { psubsw(a, b) }
+#[cfg_attr(test, assert_instr(psubsw))]
+pub unsafe fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    psubsw(a, b)
 }

 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
 /// integers in `a` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { psubusb(a, b) }
+#[cfg_attr(test, assert_instr(psubusb))]
+pub unsafe fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 {
+    psubusb(a, b)
 }

 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
 /// integers in `a` using saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    unsafe { psubusw(a, b) }
+#[cfg_attr(test, assert_instr(psubusw))]
+pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    psubusw(a, b)
 }

 /// Shift `a` left by `imm8` bytes while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
+pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
    let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
    const fn sub(a: u32, b: u32) -> u32 { a - b }
    macro_rules! shuffle {
        ($shift:expr) => {
-            unsafe {
-                simd_shuffle16::<__m128i, __m128i>(zero, a, [
-                    sub(16, $shift), sub(17, $shift),
-                    sub(18, $shift), sub(19, $shift),
-                    sub(20, $shift), sub(21, $shift),
-                    sub(22, $shift), sub(23, $shift),
-                    sub(24, $shift), sub(25, $shift),
-                    sub(26, $shift), sub(27, $shift),
-                    sub(28, $shift), sub(29, $shift),
-                    sub(30, $shift), sub(31, $shift),
-                ])
-            }
+            simd_shuffle16::<__m128i, __m128i>(zero, a, [
+                sub(16, $shift), sub(17, $shift),
+                sub(18, $shift), sub(19, $shift),
+                sub(20, $shift), sub(21, $shift),
+                sub(22, $shift), sub(23, $shift),
+                sub(24, $shift), sub(25, $shift),
+                sub(26, $shift), sub(27, $shift),
+                sub(28, $shift), sub(29, $shift),
+                sub(30, $shift), sub(31, $shift),
+            ])
        }
    }
    match imm8 {
@@ -315,117 +344,146 @@ macro_rules! shuffle {
    }
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pslldq))]
+fn _test_mm_slli_si128(a: __m128i) -> __m128i {
+    unsafe { _mm_slli_si128(a, 1) }
+}
+
 /// Shift `a` left by `imm8` bytes while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
+pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_slli_si128(a, imm8)
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pslldq))]
+fn _test_mm_bslli_si128(a: __m128i) -> __m128i {
+    unsafe { _mm_bslli_si128(a, 1) }
+}
+
 /// Shift `a` right by `imm8` bytes while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
+pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_srli_si128(a, imm8)
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(psrldq))]
+fn _test_mm_bsrli_si128(a: __m128i) -> __m128i {
+    unsafe { _mm_bsrli_si128(a, 1) }
+}
+
 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8  {
-    unsafe { pslliw(a, imm8) }
+#[cfg_attr(test, assert_instr(psllw))]
+pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8  {
+    pslliw(a, imm8)
 }

 /// Shift packed 16-bit integers in `a` left by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 {
-    unsafe { psllw(a, count) }
+#[cfg_attr(test, assert_instr(psllw))]
+pub unsafe fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 {
+    psllw(a, count)
 }

 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 {
-    unsafe { psllid(a, imm8) }
+#[cfg_attr(test, assert_instr(pslld))]
+pub unsafe fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 {
+    psllid(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` left by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { pslld(a, count) }
+#[cfg_attr(test, assert_instr(pslld))]
+pub unsafe fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    pslld(a, count)
 }

 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 {
-    unsafe { pslliq(a, imm8) }
+#[cfg_attr(test, assert_instr(psllq))]
+pub unsafe fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 {
+    pslliq(a, imm8)
 }

 /// Shift packed 64-bit integers in `a` left by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 {
-    unsafe { psllq(a, count) }
+#[cfg_attr(test, assert_instr(psllq))]
+pub unsafe fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 {
+    psllq(a, count)
 }

 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign
 /// bits.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 {
-    unsafe { psraiw(a, imm8) }
+#[cfg_attr(test, assert_instr(psraw))]
+pub unsafe fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 {
+    psraiw(a, imm8)
 }

 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign
 /// bits.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 {
-    unsafe { psraw(a, count) }
+#[cfg_attr(test, assert_instr(psraw))]
+pub unsafe fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 {
+    psraw(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign
 /// bits.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 {
-    unsafe { psraid(a, imm8) }
+#[cfg_attr(test, assert_instr(psrad))]
+pub unsafe fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 {
+    psraid(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign
 /// bits.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { psrad(a, count) }
+#[cfg_attr(test, assert_instr(psrad))]
+pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    psrad(a, count)
 }

 /// Shift `a` right by `imm8` bytes while shifting in zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
+pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
    let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
    const fn add(a: u32, b: u32) -> u32 { a + b }
    macro_rules! shuffle {
        ($shift:expr) => {
-            unsafe {
-                simd_shuffle16::<__m128i, __m128i>(a, zero, [
-                    add(0, $shift), add(1, $shift),
-                    add(2, $shift), add(3, $shift),
-                    add(4, $shift), add(5, $shift),
-                    add(6, $shift), add(7, $shift),
-                    add(8, $shift), add(9, $shift),
-                    add(10, $shift), add(11, $shift),
-                    add(12, $shift), add(13, $shift),
-                    add(14, $shift), add(15, $shift),
-                ])
-            }
+            simd_shuffle16::<__m128i, __m128i>(a, zero, [
+                add(0, $shift), add(1, $shift),
+                add(2, $shift), add(3, $shift),
+                add(4, $shift), add(5, $shift),
+                add(6, $shift), add(7, $shift),
+                add(8, $shift), add(9, $shift),
+                add(10, $shift), add(11, $shift),
+                add(12, $shift), add(13, $shift),
+                add(14, $shift), add(15, $shift),
+            ])
        }
    }
    match imm8 {
@@ -441,59 +499,73 @@ macro_rules! shuffle {
    }
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(psrldq))]
+fn _test_mm_srli_si128(a: __m128i) -> __m128i {
+    unsafe { _mm_srli_si128(a, 1) }
+}
+
 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8  {
-    unsafe { psrliw(a, imm8) }
+#[cfg_attr(test, assert_instr(psrlw))]
+pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8  {
+    psrliw(a, imm8)
 }

 /// Shift packed 16-bit integers in `a` right by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 {
-    unsafe { psrlw(a, count) }
+#[cfg_attr(test, assert_instr(psrlw))]
+pub unsafe fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 {
+    psrlw(a, count)
 }

 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 {
-    unsafe { psrlid(a, imm8) }
+#[cfg_attr(test, assert_instr(psrld))]
+pub unsafe fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 {
+    psrlid(a, imm8)
 }

 /// Shift packed 32-bit integers in `a` right by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 {
-    unsafe { psrld(a, count) }
+#[cfg_attr(test, assert_instr(psrld))]
+pub unsafe fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 {
+    psrld(a, count)
 }

 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 {
-    unsafe { psrliq(a, imm8) }
+#[cfg_attr(test, assert_instr(psrlq))]
+pub unsafe fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 {
+    psrliq(a, imm8)
 }

 /// Shift packed 64-bit integers in `a` right by `count` while shifting in
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 {
-    unsafe { psrlq(a, count) }
+#[cfg_attr(test, assert_instr(psrlq))]
+pub unsafe fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 {
+    psrlq(a, count)
 }

 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and
 /// `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
+#[cfg_attr(test, assert_instr(andps))]
+pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    a & b
 }

@@ -501,7 +573,8 @@ pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
 /// then AND with `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
+#[cfg_attr(test, assert_instr(andnps))]
+pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    (!a) & b
 }

@@ -509,7 +582,8 @@ pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
 /// `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
+#[cfg_attr(test, assert_instr(orps))]
+pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    a | b
 }

@@ -517,70 +591,80 @@ pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
 /// `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
+#[cfg_attr(test, assert_instr(xorps))]
+pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    a ^ b
 }

 /// Compare packed 8-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
+#[cfg_attr(test, assert_instr(pcmpeqb))]
+pub unsafe fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
    a.eq(b)
 }

 /// Compare packed 16-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(pcmpeqw))]
+pub unsafe fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a.eq(b)
 }

 /// Compare packed 32-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
+#[cfg_attr(test, assert_instr(pcmpeqd))]
+pub unsafe fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
    a.eq(b)
 }

 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+pub unsafe fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
    a.gt(b)
 }

 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+pub unsafe fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a.gt(b)
 }

 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+pub unsafe fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
    a.gt(b)
 }

 /// Compare packed 8-bit integers in `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+pub unsafe fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
    a.lt(b)
 }

 /// Compare packed 16-bit integers in `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+pub unsafe fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
    a.lt(b)
 }

 /// Compare packed 32-bit integers in `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
+#[cfg_attr(test, assert_instr(pcmpgtd))]
+pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
    a.lt(b)
 }

@@ -588,31 +672,37 @@ pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
 /// double-precision (64-bit) floating-point elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2  {
-    unsafe { simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1])) }
+#[cfg_attr(test, assert_instr(cvtdq2pd))]
+pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2  {
+    simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1]))
 }

 /// Return `a` with its lower element replaced by `b` after converting it to
 /// an `f64`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+pub unsafe fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
    a.replace(0, b as f64)
 }

 /// Return `a` with its lower element replaced by `b` after converting it to
 /// an `f64`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+pub unsafe fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
    a.replace(0, b as f64)
 }

 /// Return `a` with its lower element replaced by `b` after converting it to
 /// an `f64`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
+#[cfg_attr(test, assert_instr(cvtsi2sd))]
+pub unsafe fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
    _mm_cvtsi64_sd(a, b)
 }

@@ -620,52 +710,63 @@ pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
 /// floating-point elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 {
-    unsafe { cvtdq2ps(a) }
+#[cfg_attr(test, assert_instr(cvtdq2ps))]
+pub unsafe fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 {
+    cvtdq2ps(a)
 }

 /// Return a vector whose lowest element is `a` and all higher elements are
 /// `0`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi32_si128(a: i32) -> i32x4 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi32_si128(a: i32) -> i32x4 {
    i32x4::new(a, 0, 0, 0)
 }

 /// Return a vector whose lowest element is `a` and all higher elements are
 /// `0`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
    i64x2::new(a, 0)
 }

 /// Return a vector whose lowest element is `a` and all higher elements are
 /// `0`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
    _mm_cvtsi64_si128(a)
 }

 /// Return the lowest element of `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi128_si32(a: i32x4) -> i32 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi128_si32(a: i32x4) -> i32 {
    a.extract(0)
 }

 /// Return the lowest element of `a`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
    a.extract(0)
 }

 /// Return the lowest element of `a`.
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
+// no particular instruction to test
+pub unsafe fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
    _mm_cvtsi128_si64(a)
 }

@@ -673,21 +774,24 @@ pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
 /// lowest.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 {
+// no particular instruction to test
+pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 {
    i64x2::new(e0, e1)
 }

 /// Set packed 32-bit integers with the supplied values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
+// no particular instruction to test
+pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
    i32x4::new(e0, e1, e2, e3)
 }

 /// Set packed 16-bit integers with the supplied values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set_epi16(
+// no particular instruction to test
+pub unsafe fn _mm_set_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
 ) -> i16x8 {
    i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
@@ -696,7 +800,8 @@ pub fn _mm_set_epi16(
 /// Set packed 8-bit integers with the supplied values.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set_epi8(
+// no particular instruction to test
+pub unsafe fn _mm_set_epi8(
    e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
 ) -> i8x16 {
@@ -708,42 +813,48 @@ pub fn _mm_set_epi8(
 /// Broadcast 64-bit integer `a` to all elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set1_epi64x(a: i64) -> i64x2 {
+// no particular instruction to test
+pub unsafe fn _mm_set1_epi64x(a: i64) -> i64x2 {
    i64x2::splat(a)
 }

 /// Broadcast 32-bit integer `a` to all elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set1_epi32(a: i32) -> i32x4 {
+// no particular instruction to test
+pub unsafe fn _mm_set1_epi32(a: i32) -> i32x4 {
    i32x4::splat(a)
 }

 /// Broadcast 16-bit integer `a` to all elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set1_epi16(a: i16) -> i16x8 {
+// no particular instruction to test
+pub unsafe fn _mm_set1_epi16(a: i16) -> i16x8 {
    i16x8::splat(a)
 }

 /// Broadcast 8-bit integer `a` to all elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_set1_epi8(a: i8) -> i8x16 {
+// no particular instruction to test
+pub unsafe fn _mm_set1_epi8(a: i8) -> i8x16 {
    i8x16::splat(a)
 }

 /// Set packed 32-bit integers with the supplied values in reverse order.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
+// no particular instruction to test
+pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
    i32x4::new(e3, e2, e1, e0)
 }

 /// Set packed 16-bit integers with the supplied values in reverse order.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_setr_epi16(
+// no particular instruction to test
+pub unsafe fn _mm_setr_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
 ) -> i16x8 {
    i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
@@ -752,7 +863,8 @@ pub fn _mm_setr_epi16(
 /// Set packed 8-bit integers with the supplied values in reverse order.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_setr_epi8(
+// no particular instruction to test
+pub unsafe fn _mm_setr_epi8(
    e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
 ) -> i8x16 {
@@ -764,13 +876,15 @@ pub fn _mm_setr_epi8(
 /// Returns a vector with all elements set to zero.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_setzero_si128() -> __m128i {
+#[cfg_attr(test, assert_instr(xorps))]
+pub unsafe fn _mm_setzero_si128() -> __m128i {
    __m128i::splat(0)
 }

 /// Load 64-bit integer from memory into first element of returned vector.
 #[inline(always)]
 #[target_feature = "+sse2"]
+// no particular instruction to test
 pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 {
    i64x2::new((*mem_addr).extract(0), 0)
 }
@@ -780,6 +894,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 {
 /// `mem_addr` must be aligned on a 16-byte boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
 }
@@ -789,6 +904,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
 /// `mem_addr` does not need to be aligned on any particular boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movups))]
 pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst = mem::uninitialized();
    ptr::copy_nonoverlapping(
@@ -808,6 +924,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
 /// to be aligned on any particular boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(maskmovdqu))]
 pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) {
    maskmovdqu(a, mask, mem_addr)
 }
@@ -817,6 +934,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) {
 /// `mem_addr` must be aligned on a 16-byte boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
 }
@@ -826,6 +944,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
 /// `mem_addr` does not need to be aligned on any particular boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movups))]
 pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(
        &a as *const _ as *const u8,
@@ -838,6 +957,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
 /// `mem_addr` does not need to be aligned on any particular boundary.
 #[inline(always)]
 #[target_feature = "+sse2"]
+// no particular instruction to test
 pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(
        &a as *const _ as *const u8, mem_addr as *mut u8, 8);
@@ -847,59 +967,78 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
 /// element is zero.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_move_epi64(a: i64x2) -> i64x2 {
-    a.replace(1, 0)
+// no particular instruction to test
+pub unsafe fn _mm_move_epi64(a: i64x2) -> i64x2 {
+    simd_shuffle2(a, i64x2::splat(0), [0, 2])
 }

 /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 {
-    unsafe { packsswb(a, b) }
+#[cfg_attr(test, assert_instr(packsswb))]
+pub unsafe fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 {
+    packsswb(a, b)
 }

 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 {
-    unsafe { packssdw(a, b) }
+#[cfg_attr(test, assert_instr(packssdw))]
+pub unsafe fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 {
+    packssdw(a, b)
 }

 /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using unsigned saturation.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 {
-    unsafe { packuswb(a, b) }
+#[cfg_attr(test, assert_instr(packuswb))]
+pub unsafe fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 {
+    packuswb(a, b)
 }

 /// Return the `imm8` element of `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
+pub unsafe fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
    a.extract(imm8 as u32 & 0b111) as i32
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pextrw))]
+fn _test_mm_extract_epi16(a: i16x8) -> i32 {
+    unsafe { _mm_extract_epi16(a, 9) }
+}
+
 /// Return a new vector where the `imm8` element of `a` is replaced with `i`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
+pub unsafe fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
    a.replace(imm8 as u32 & 0b111, i as i16)
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pinsrw))]
+fn _test_mm_insert_epi16(a: i16x8, i: i32) -> i16x8 {
+    unsafe { _mm_insert_epi16(a, i, 9) }
+}
+
 /// Return a mask of the most significant bit of each element in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_movemask_epi8(a: i8x16) -> i32 {
-    unsafe { pmovmskb(a) }
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _mm_movemask_epi8(a: i8x16) -> i32 {
+    pmovmskb(a)
 }

 /// Shuffle 32-bit integers in `a` using the control in `imm8`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
+pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
    // simd_shuffleX requires that its selector parameter be made up of
    // constant values, but we can't enforce that here. In spirit, we need
    // to write a `match` on all possible values of a byte, and for each value,
@@ -911,9 +1050,7 @@ pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
-            unsafe {
-                simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
-            }
+            simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
        }
    }
    macro_rules! shuffle_x67 {
@@ -954,6 +1091,13 @@ macro_rules! shuffle_x23 {
    }
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pshufd))]
+fn _test_mm_shuffle_epi32(a: i32x4) -> i32x4 {
+    unsafe { _mm_shuffle_epi32(a, 9) }
+}
+
 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in
 /// `imm8`.
 ///
@@ -961,18 +1105,16 @@ macro_rules! shuffle_x23 {
-/// bits being copied from from `a`.
+/// bits being copied from `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
+pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
    // See _mm_shuffle_epi32.
    let imm8 = (imm8 & 0xFF) as u8;
    const fn add4(x: u32) -> u32 { x + 4 }

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
-            unsafe {
-                simd_shuffle8(a, a, [
-                    0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67),
-                ])
-            }
+            simd_shuffle8(a, a, [
+                0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67),
+            ])
        }
    }
    macro_rules! shuffle_x67 {
@@ -1013,6 +1155,13 @@ macro_rules! shuffle_x23 {
    }
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pshufhw))]
+fn _test_mm_shufflehi_epi16(a: i16x8) -> i16x8 {
+    unsafe { _mm_shufflehi_epi16(a, 9) }
+}
+
 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in
 /// `imm8`.
 ///
@@ -1020,15 +1169,13 @@ macro_rules! shuffle_x23 {
 /// bits being copied from `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
+pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
    // See _mm_shuffle_epi32.
    let imm8 = (imm8 & 0xFF) as u8;

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
-            unsafe {
-                simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
-            }
+            simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
        }
    }
    macro_rules! shuffle_x67 {
@@ -1069,77 +1216,89 @@ macro_rules! shuffle_x23 {
    }
 }

+#[cfg(test)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(pshuflw))]
+fn _test_mm_shufflelo_epi16(a: i16x8) -> i16x8 {
+    unsafe { _mm_shufflelo_epi16(a, 9) }
+}
+
 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    unsafe {
-        simd_shuffle16(a, b, [
-            8, 24, 9, 25, 10, 26, 11, 27,
-            12, 28, 13, 29, 14, 30, 15, 31,
-        ])
-    }
+#[cfg_attr(test, assert_instr(punpckhbw))]
+pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    simd_shuffle16(a, b, [
+        8, 24, 9, 25, 10, 26, 11, 27,
+        12, 28, 13, 29, 14, 30, 15, 31,
+    ])
 }

 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) }
+#[cfg_attr(test, assert_instr(punpckhwd))]
+pub unsafe fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 {
-    unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
+#[cfg_attr(test, assert_instr(punpckhdq))]
+pub unsafe fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 {
+    simd_shuffle4(a, b, [2, 6, 3, 7])
 }

 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
-    unsafe { simd_shuffle2(a, b, [1, 3]) }
+#[cfg_attr(test, assert_instr(punpckhqdq))]
+pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
+    simd_shuffle2(a, b, [1, 3])
 }

 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    unsafe {
-        simd_shuffle16(a, b, [
-            0, 16, 1, 17, 2, 18, 3, 19,
-            4, 20, 5, 21, 6, 22, 7, 23,
-        ])
-    }
+#[cfg_attr(test, assert_instr(punpcklbw))]
+pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    simd_shuffle16(a, b, [
+        0, 16, 1, 17, 2, 18, 3, 19,
+        4, 20, 5, 21, 6, 22, 7, 23,
+    ])
 }

 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 {
-    unsafe { simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) }
+#[cfg_attr(test, assert_instr(punpcklwd))]
+pub unsafe fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 {
+    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 {
-    unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
+#[cfg_attr(test, assert_instr(punpckldq))]
+pub unsafe fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 {
+    simd_shuffle4(a, b, [0, 4, 1, 5])
 }

 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
-    unsafe { simd_shuffle2(a, b, [0, 2]) }
+#[cfg_attr(test, assert_instr(punpcklqdq))]
+pub unsafe fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
+    simd_shuffle2(a, b, [0, 2])
 }

 /// Return a new vector with the low element of `a` replaced by the sum of the
 /// low elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(addsd))]
+pub unsafe fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
    a.replace(0, a.extract(0) + b.extract(0))
 }

@@ -1147,7 +1306,8 @@ pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(addpd))]
+pub unsafe fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
    a + b
 }

@@ -1155,7 +1315,8 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
 /// dividing the lower element of `a` by the lower element of `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(divsd))]
+pub unsafe fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
    a.replace(0, a.extract(0) / b.extract(0))
 }

@@ -1163,7 +1324,8 @@ pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// packed elements in `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(divpd))]
+pub unsafe fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
    a / b
 }

@@ -1171,39 +1333,44 @@ pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 {
 /// of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { maxsd(a, b) }
+#[cfg_attr(test, assert_instr(maxsd))]
+pub unsafe fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 {
+    maxsd(a, b)
 }

 /// Return a new vector with the maximum values from corresponding elements in
 /// `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { maxpd(a, b) }
+#[cfg_attr(test, assert_instr(maxpd))]
+pub unsafe fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 {
+    maxpd(a, b)
 }

 /// Return a new vector with the low element of `a` replaced by the minimum
 /// of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { minsd(a, b) }
+#[cfg_attr(test, assert_instr(minsd))]
+pub unsafe fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 {
+    minsd(a, b)
 }

 /// Return a new vector with the minimum values from corresponding elements in
 /// `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { minpd(a, b) }
+#[cfg_attr(test, assert_instr(minpd))]
+pub unsafe fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
+    minpd(a, b)
 }

 /// Return a new vector with the low element of `a` replaced by multiplying the
 /// low elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(mulsd))]
+pub unsafe fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
    a.replace(0, a.extract(0) * b.extract(0))
 }

@@ -1211,7 +1378,8 @@ pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(mulpd))]
+pub unsafe fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
    a * b
 }

@@ -1219,22 +1387,25 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
 /// root of the lower element `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
-    a.replace(0, unsafe { sqrtsd(b).extract(0) })
+#[cfg_attr(test, assert_instr(sqrtsd))]
+pub unsafe fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
+    a.replace(0, sqrtsd(b).extract(0))
 }

 /// Return a new vector with the square root of each of the values in `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
-    unsafe { sqrtpd(a) }
+#[cfg_attr(test, assert_instr(sqrtpd))]
+pub unsafe fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
+    sqrtpd(a)
 }

 /// Return a new vector with the low element of `a` replaced by subtracting the
 /// low element of `b` from the low element of `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(subsd))]
+pub unsafe fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
    a.replace(0, a.extract(0) - b.extract(0))
 }

@@ -1242,7 +1413,8 @@ pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// from `a`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(subpd))]
+pub unsafe fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
    a - b
 }

@@ -1250,76 +1422,76 @@ pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 {
 /// elements in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe {
-        let a: i64x2 = mem::transmute(a);
-        let b: i64x2 = mem::transmute(b);
-        mem::transmute(a & b)
-    }
+#[cfg_attr(test, assert_instr(andps))]
+pub unsafe fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 {
+    let a: i64x2 = mem::transmute(a);
+    let b: i64x2 = mem::transmute(b);
+    mem::transmute(a & b)
 }

 /// Compute the bitwise NOT of `a` and then AND with `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe {
-        let a: i64x2 = mem::transmute(a);
-        let b: i64x2 = mem::transmute(b);
-        mem::transmute((!a) & b)
-    }
+#[cfg_attr(test, assert_instr(andnps))]
+pub unsafe fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 {
+    let a: i64x2 = mem::transmute(a);
+    let b: i64x2 = mem::transmute(b);
+    mem::transmute((!a) & b)
 }

 /// Compute the bitwise OR of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe {
-        let a: i64x2 = mem::transmute(a);
-        let b: i64x2 = mem::transmute(b);
-        mem::transmute(a | b)
-    }
+#[cfg_attr(test, assert_instr(orps))]
+pub unsafe fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 {
+    let a: i64x2 = mem::transmute(a);
+    let b: i64x2 = mem::transmute(b);
+    mem::transmute(a | b)
 }

 /// Compute the bitwise XOR of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe {
-        let a: i64x2 = mem::transmute(a);
-        let b: i64x2 = mem::transmute(b);
-        mem::transmute(a ^ b)
-    }
+#[cfg_attr(test, assert_instr(xorps))]
+pub unsafe fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 {
+    let a: i64x2 = mem::transmute(a);
+    let b: i64x2 = mem::transmute(b);
+    mem::transmute(a ^ b)
 }

 /// Return a new vector with the low element of `a` replaced by the equality
 /// comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 0) }
+#[cfg_attr(test, assert_instr(cmpeqsd))]
+pub unsafe fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 0)
 }

 /// Return a new vector with the low element of `a` replaced by the less-than
 /// comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 1) }
+#[cfg_attr(test, assert_instr(cmpltsd))]
+pub unsafe fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 1)
 }

 /// Return a new vector with the low element of `a` replaced by the
 /// less-than-or-equal comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 2) }
+#[cfg_attr(test, assert_instr(cmplesd))]
+pub unsafe fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 2)
 }

 /// Return a new vector with the low element of `a` replaced by the
 /// greater-than comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpltsd))]
+pub unsafe fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmplt_sd(b, a).replace(1, a.extract(1))
 }

@@ -1327,7 +1499,8 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// greater-than-or-equal comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmplesd))]
+pub unsafe fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmple_sd(b, a).replace(1, a.extract(1))
 }

@@ -1337,8 +1510,9 @@ pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// otherwise.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 7) }
+#[cfg_attr(test, assert_instr(cmpordsd))]
+pub unsafe fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 7)
 }

 /// Return a new vector with the low element of `a` replaced by the result of
@@ -1346,39 +1520,44 @@ pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 3) }
+#[cfg_attr(test, assert_instr(cmpunordsd))]
+pub unsafe fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 3)
 }

 /// Return a new vector with the low element of `a` replaced by the not-equal
 /// comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 4) }
+#[cfg_attr(test, assert_instr(cmpneqsd))]
+pub unsafe fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 4)
 }

 /// Return a new vector with the low element of `a` replaced by the
 /// not-less-than comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 5) }
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+pub unsafe fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 5)
 }

 /// Return a new vector with the low element of `a` replaced by the
 /// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmpsd(a, b, 6) }
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+pub unsafe fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
+    cmpsd(a, b, 6)
 }

 /// Return a new vector with the low element of `a` replaced by the
 /// not-greater-than comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpnltsd))]
+pub unsafe fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmpnlt_sd(b, a).replace(1, a.extract(1))
 }

@@ -1386,84 +1565,96 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
 /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpnlesd))]
+pub unsafe fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmpnle_sd(b, a).replace(1, a.extract(1))
 }

 /// Compare corresponding elements in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 0) }
+#[cfg_attr(test, assert_instr(cmpeqpd))]
+pub unsafe fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 0)
 }

 /// Compare corresponding elements in `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 1) }
+#[cfg_attr(test, assert_instr(cmpltpd))]
+pub unsafe fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 1)
 }

 /// Compare corresponding elements in `a` and `b` for less-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 2) }
+#[cfg_attr(test, assert_instr(cmplepd))]
+pub unsafe fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 2)
 }

 /// Compare corresponding elements in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpltpd))]
+pub unsafe fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmplt_pd(b, a)
 }

 /// Compare corresponding elements in `a` and `b` for greater-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmplepd))]
+pub unsafe fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmple_pd(b, a)
 }

 /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 7) }
+#[cfg_attr(test, assert_instr(cmpordpd))]
+pub unsafe fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 7)
 }

 /// Compare corresponding elements in `a` and `b` to see if either is `NaN`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 3) }
+#[cfg_attr(test, assert_instr(cmpunordpd))]
+pub unsafe fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 3)
 }

 /// Compare corresponding elements in `a` and `b` for not-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 4) }
+#[cfg_attr(test, assert_instr(cmpneqpd))]
+pub unsafe fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 4)
 }

 /// Compare corresponding elements in `a` and `b` for not-less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 5) }
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+pub unsafe fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 5)
 }

 /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 {
-    unsafe { cmppd(a, b, 6) }
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+pub unsafe fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 {
+    cmppd(a, b, 6)
 }

 /// Compare corresponding elements in `a` and `b` for not-greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpnltpd))]
+pub unsafe fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmpnlt_pd(b, a)
 }

@@ -1471,92 +1662,105 @@ pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 {
 /// not-greater-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 {
+#[cfg_attr(test, assert_instr(cmpnlepd))]
+pub unsafe fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 {
    _mm_cmpnle_pd(b, a)
 }

 /// Compare the lower element of `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comieqsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comieqsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comiltsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comiltsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for less-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comilesd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comilesd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comigtsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comigtsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for greater-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comigesd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comigesd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for not-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(comineqsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(comisd))]
+pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(comineqsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomieqsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomieqsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for less-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomiltsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomiltsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for less-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomilesd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomilesd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomigtsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomigtsd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for greater-than-or-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomigesd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomigesd(a, b) as u8)
 }

 /// Compare the lower element of `a` and `b` for not-equal.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
-    unsafe { mem::transmute(ucomineqsd(a, b) as u8) }
+#[cfg_attr(test, assert_instr(ucomisd))]
+pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
+    mem::transmute(ucomineqsd(a, b) as u8)
 }

 /// Return a mask of the most significant bit of each element in `a`.
@@ -1565,8 +1769,9 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
 /// All other bits are set to `0`.
 #[inline(always)]
 #[target_feature = "+sse2"]
-pub fn _mm_movemask_pd(a: f64x2) -> i32 {
-    unsafe { movmskpd(a) }
+#[cfg_attr(test, assert_instr(movmskpd))]
+pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
+    movmskpd(a)
 }


@@ -1574,12 +1779,14 @@ pub fn _mm_movemask_pd(a: f64x2) -> i32 {

 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
    *(mem_addr as *const f64x2)
 }

 #[inline(always)]
 #[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) {
    *(mem_addr as *mut f64x2) = a;
 }
@@ -1730,7 +1937,7 @@ mod tests {

    #[simd_test = "sse2"]
    fn _mm_pause() {
-        sse2::_mm_pause();
+        unsafe { sse2::_mm_pause() };
    }

    #[simd_test = "sse2"]
@@ -1741,12 +1948,12 @@ fn _mm_clflush() {

    #[simd_test = "sse2"]
    fn _mm_lfence() {
-        sse2::_mm_lfence();
+        unsafe { sse2::_mm_lfence() };
    }

    #[simd_test = "sse2"]
    fn _mm_mfence() {
-        sse2::_mm_mfence();
+        unsafe { sse2::_mm_mfence() };
    }

    #[simd_test = "sse2"]
@@ -1755,7 +1962,7 @@ fn _mm_add_epi8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i8x16::new(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-        let r = sse2::_mm_add_epi8(a, b);
+        let r = unsafe { sse2::_mm_add_epi8(a, b) };
        let e = i8x16::new(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
        assert_eq!(r, e);
@@ -1765,7 +1972,7 @@ fn _mm_add_epi8() {
    fn _mm_add_epi8_overflow() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(1);
-        let r = sse2::_mm_add_epi8(a, b);
+        let r = unsafe { sse2::_mm_add_epi8(a, b) };
        assert_eq!(r, i8x16::splat(-128));
    }

@@ -1773,7 +1980,7 @@ fn _mm_add_epi8_overflow() {
    fn _mm_add_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
-        let r = sse2::_mm_add_epi16(a, b);
+        let r = unsafe { sse2::_mm_add_epi16(a, b) };
        let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq!(r, e);
    }
@@ -1782,7 +1989,7 @@ fn _mm_add_epi16() {
    fn _mm_add_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
-        let r = sse2::_mm_add_epi32(a, b);
+        let r = unsafe { sse2::_mm_add_epi32(a, b) };
        let e = i32x4::new(4, 6, 8, 10);
        assert_eq!(r, e);
    }
@@ -1791,7 +1998,7 @@ fn _mm_add_epi32() {
    fn _mm_add_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
-        let r = sse2::_mm_add_epi64(a, b);
+        let r = unsafe { sse2::_mm_add_epi64(a, b) };
        let e = i64x2::new(2, 4);
        assert_eq!(r, e);
    }
@@ -1802,7 +2009,7 @@ fn _mm_adds_epi8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i8x16::new(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-        let r = sse2::_mm_adds_epi8(a, b);
+        let r = unsafe { sse2::_mm_adds_epi8(a, b) };
        let e = i8x16::new(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
        assert_eq!(r, e);
@@ -1812,7 +2019,7 @@ fn _mm_adds_epi8() {
    fn _mm_adds_epi8_saturate_positive() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(1);
-        let r = sse2::_mm_adds_epi8(a, b);
+        let r = unsafe { sse2::_mm_adds_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -1820,7 +2027,7 @@ fn _mm_adds_epi8_saturate_positive() {
    fn _mm_adds_epi8_saturate_negative() {
        let a = i8x16::splat(-0x80);
        let b = i8x16::splat(-1);
-        let r = sse2::_mm_adds_epi8(a, b);
+        let r = unsafe { sse2::_mm_adds_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -1828,7 +2035,7 @@ fn _mm_adds_epi8_saturate_negative() {
    fn _mm_adds_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
-        let r = sse2::_mm_adds_epi16(a, b);
+        let r = unsafe { sse2::_mm_adds_epi16(a, b) };
        let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq!(r, e);
    }
@@ -1837,7 +2044,7 @@ fn _mm_adds_epi16() {
    fn _mm_adds_epi16_saturate_positive() {
        let a = i16x8::splat(0x7FFF);
        let b = i16x8::splat(1);
-        let r = sse2::_mm_adds_epi16(a, b);
+        let r = unsafe { sse2::_mm_adds_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1845,7 +2052,7 @@ fn _mm_adds_epi16_saturate_positive() {
    fn _mm_adds_epi16_saturate_negative() {
        let a = i16x8::splat(-0x8000);
        let b = i16x8::splat(-1);
-        let r = sse2::_mm_adds_epi16(a, b);
+        let r = unsafe { sse2::_mm_adds_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1855,7 +2062,7 @@ fn _mm_adds_epu8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = u8x16::new(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-        let r = sse2::_mm_adds_epu8(a, b);
+        let r = unsafe { sse2::_mm_adds_epu8(a, b) };
        let e = u8x16::new(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
        assert_eq!(r, e);
@@ -1865,7 +2072,7 @@ fn _mm_adds_epu8() {
    fn _mm_adds_epu8_saturate() {
        let a = u8x16::splat(0xFF);
        let b = u8x16::splat(1);
-        let r = sse2::_mm_adds_epu8(a, b);
+        let r = unsafe { sse2::_mm_adds_epu8(a, b) };
        assert_eq!(r, a);
    }

@@ -1873,7 +2080,7 @@ fn _mm_adds_epu8_saturate() {
    fn _mm_adds_epu16() {
        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
-        let r = sse2::_mm_adds_epu16(a, b);
+        let r = unsafe { sse2::_mm_adds_epu16(a, b) };
        let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq!(r, e);
    }
@@ -1882,21 +2089,21 @@ fn _mm_adds_epu16() {
    fn _mm_adds_epu16_saturate() {
        let a = u16x8::splat(0xFFFF);
        let b = u16x8::splat(1);
-        let r = sse2::_mm_adds_epu16(a, b);
+        let r = unsafe { sse2::_mm_adds_epu16(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "sse2"]
    fn _mm_avg_epu8() {
        let (a, b) = (u8x16::splat(3), u8x16::splat(9));
-        let r = sse2::_mm_avg_epu8(a, b);
+        let r = unsafe { sse2::_mm_avg_epu8(a, b) };
        assert_eq!(r, u8x16::splat(6));
    }

    #[simd_test = "sse2"]
    fn _mm_avg_epu16() {
        let (a, b) = (u16x8::splat(3), u16x8::splat(9));
-        let r = sse2::_mm_avg_epu16(a, b);
+        let r = unsafe { sse2::_mm_avg_epu16(a, b) };
        assert_eq!(r, u16x8::splat(6));
    }

@@ -1904,7 +2111,7 @@ fn _mm_avg_epu16() {
    fn _mm_madd_epi16() {
        let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_madd_epi16(a, b);
+        let r = unsafe { sse2::_mm_madd_epi16(a, b) };
        let e = i32x4::new(29, 81, 149, 233);
        assert_eq!(r, e);
    }
@@ -1913,7 +2120,7 @@ fn _mm_madd_epi16() {
    fn _mm_max_epi16() {
        let a = i16x8::splat(1);
        let b = i16x8::splat(-1);
-        let r = sse2::_mm_max_epi16(a, b);
+        let r = unsafe { sse2::_mm_max_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -1921,7 +2128,7 @@ fn _mm_max_epi16() {
    fn _mm_max_epu8() {
        let a = u8x16::splat(1);
        let b = u8x16::splat(255);
-        let r = sse2::_mm_max_epu8(a, b);
+        let r = unsafe { sse2::_mm_max_epu8(a, b) };
        assert_eq!(r, b);
    }

@@ -1929,7 +2136,7 @@ fn _mm_max_epu8() {
    fn _mm_min_epi16() {
        let a = i16x8::splat(1);
        let b = i16x8::splat(-1);
-        let r = sse2::_mm_min_epi16(a, b);
+        let r = unsafe { sse2::_mm_min_epi16(a, b) };
        assert_eq!(r, b);
    }

@@ -1937,28 +2144,28 @@ fn _mm_min_epi16() {
    fn _mm_min_epu8() {
        let a = u8x16::splat(1);
        let b = u8x16::splat(255);
-        let r = sse2::_mm_min_epu8(a, b);
+        let r = unsafe { sse2::_mm_min_epu8(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "sse2"]
    fn _mm_mulhi_epi16() {
        let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
-        let r = sse2::_mm_mulhi_epi16(a, b);
+        let r = unsafe { sse2::_mm_mulhi_epi16(a, b) };
        assert_eq!(r, i16x8::splat(-16));
    }

    #[simd_test = "sse2"]
    fn _mm_mulhi_epu16() {
        let (a, b) = (u16x8::splat(1000), u16x8::splat(1001));
-        let r = sse2::_mm_mulhi_epu16(a, b);
+        let r = unsafe { sse2::_mm_mulhi_epu16(a, b) };
        assert_eq!(r, u16x8::splat(15));
    }

    #[simd_test = "sse2"]
    fn _mm_mullo_epi16() {
        let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
-        let r = sse2::_mm_mullo_epi16(a, b);
+        let r = unsafe { sse2::_mm_mullo_epi16(a, b) };
        assert_eq!(r, i16x8::splat(-17960));
    }

@@ -1966,7 +2173,7 @@ fn _mm_mullo_epi16() {
    fn _mm_mul_epu32() {
        let a = u32x4::from(u64x2::new(1_000_000_000, 1 << 34));
        let b = u32x4::from(u64x2::new(1_000_000_000, 1 << 35));
-        let r = sse2::_mm_mul_epu32(a, b);
+        let r = unsafe { sse2::_mm_mul_epu32(a, b) };
        let e = u64x2::new(1_000_000_000 * 1_000_000_000, 0);
        assert_eq!(r, e);
    }
@@ -1979,7 +2186,7 @@ fn _mm_sad_epu8() {
        let b = u8x16::new(
            0, 0, 0, 0, 2, 1, 2, 1,
            1, 1, 1, 1, 1, 2, 1, 2);
-        let r = sse2::_mm_sad_epu8(a, b);
+        let r = unsafe { sse2::_mm_sad_epu8(a, b) };
        let e = u64x2::new(1020, 614);
        assert_eq!(r, e);
    }
@@ -1987,35 +2194,35 @@ fn _mm_sad_epu8() {
    #[simd_test = "sse2"]
    fn _mm_sub_epi8() {
        let (a, b) = (i8x16::splat(5), i8x16::splat(6));
-        let r = sse2::_mm_sub_epi8(a, b);
+        let r = unsafe { sse2::_mm_sub_epi8(a, b) };
        assert_eq!(r, i8x16::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_sub_epi16() {
        let (a, b) = (i16x8::splat(5), i16x8::splat(6));
-        let r = sse2::_mm_sub_epi16(a, b);
+        let r = unsafe { sse2::_mm_sub_epi16(a, b) };
        assert_eq!(r, i16x8::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_sub_epi32() {
        let (a, b) = (i32x4::splat(5), i32x4::splat(6));
-        let r = sse2::_mm_sub_epi32(a, b);
+        let r = unsafe { sse2::_mm_sub_epi32(a, b) };
        assert_eq!(r, i32x4::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_sub_epi64() {
        let (a, b) = (i64x2::splat(5), i64x2::splat(6));
-        let r = sse2::_mm_sub_epi64(a, b);
+        let r = unsafe { sse2::_mm_sub_epi64(a, b) };
        assert_eq!(r, i64x2::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_subs_epi8() {
        let (a, b) = (i8x16::splat(5), i8x16::splat(2));
-        let r = sse2::_mm_subs_epi8(a, b);
+        let r = unsafe { sse2::_mm_subs_epi8(a, b) };
        assert_eq!(r, i8x16::splat(3));
    }

@@ -2023,7 +2230,7 @@ fn _mm_subs_epi8() {
    fn _mm_subs_epi8_saturate_positive() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(-1);
-        let r = sse2::_mm_subs_epi8(a, b);
+        let r = unsafe { sse2::_mm_subs_epi8(a, b) };
        assert_eq!(r, a);
    }

@@ -2031,14 +2238,14 @@ fn _mm_subs_epi8_saturate_positive() {
    fn _mm_subs_epi8_saturate_negative() {
        let a = i8x16::splat(-0x80);
        let b = i8x16::splat(1);
-        let r = sse2::_mm_subs_epi8(a, b);
+        let r = unsafe { sse2::_mm_subs_epi8(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "sse2"]
    fn _mm_subs_epi16() {
        let (a, b) = (i16x8::splat(5), i16x8::splat(2));
-        let r = sse2::_mm_subs_epi16(a, b);
+        let r = unsafe { sse2::_mm_subs_epi16(a, b) };
        assert_eq!(r, i16x8::splat(3));
    }

@@ -2046,7 +2253,7 @@ fn _mm_subs_epi16() {
    fn _mm_subs_epi16_saturate_positive() {
        let a = i16x8::splat(0x7FFF);
        let b = i16x8::splat(-1);
-        let r = sse2::_mm_subs_epi16(a, b);
+        let r = unsafe { sse2::_mm_subs_epi16(a, b) };
        assert_eq!(r, a);
    }

@@ -2054,14 +2261,14 @@ fn _mm_subs_epi16_saturate_positive() {
    fn _mm_subs_epi16_saturate_negative() {
        let a = i16x8::splat(-0x8000);
        let b = i16x8::splat(1);
-        let r = sse2::_mm_subs_epi16(a, b);
+        let r = unsafe { sse2::_mm_subs_epi16(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "sse2"]
    fn _mm_subs_epu8() {
        let (a, b) = (u8x16::splat(5), u8x16::splat(2));
-        let r = sse2::_mm_subs_epu8(a, b);
+        let r = unsafe { sse2::_mm_subs_epu8(a, b) };
        assert_eq!(r, u8x16::splat(3));
    }

@@ -2069,14 +2276,14 @@ fn _mm_subs_epu8() {
    fn _mm_subs_epu8_saturate() {
        let a = u8x16::splat(0);
        let b = u8x16::splat(1);
-        let r = sse2::_mm_subs_epu8(a, b);
+        let r = unsafe { sse2::_mm_subs_epu8(a, b) };
        assert_eq!(r, a);
    }

    #[simd_test = "sse2"]
    fn _mm_subs_epu16() {
        let (a, b) = (u16x8::splat(5), u16x8::splat(2));
-        let r = sse2::_mm_subs_epu16(a, b);
+        let r = unsafe { sse2::_mm_subs_epu16(a, b) };
        assert_eq!(r, u16x8::splat(3));
    }

@@ -2084,7 +2291,7 @@ fn _mm_subs_epu16() {
    fn _mm_subs_epu16_saturate() {
        let a = u16x8::splat(0);
        let b = u16x8::splat(1);
-        let r = sse2::_mm_subs_epu16(a, b);
+        let r = unsafe { sse2::_mm_subs_epu16(a, b) };
        assert_eq!(r, a);
    }

@@ -2092,31 +2299,31 @@ fn _mm_subs_epu16_saturate() {
    fn _mm_slli_si128() {
        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_slli_si128(a, 1);
+        let r = unsafe { sse2::_mm_slli_si128(a, 1) };
        let e = __m128i::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq!(r, e);

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_slli_si128(a, 15);
+        let r = unsafe { sse2::_mm_slli_si128(a, 15) };
        let e = __m128i::new(
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        assert_eq!(r, e);

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_slli_si128(a, 16);
+        let r = unsafe { sse2::_mm_slli_si128(a, 16) };
        assert_eq!(r, __m128i::splat(0));

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_slli_si128(a, -1);
+        let r = unsafe { sse2::_mm_slli_si128(a, -1) };
        assert_eq!(r, __m128i::splat(0));

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_slli_si128(a, -0x80000000);
+        let r = unsafe { sse2::_mm_slli_si128(a, -0x80000000) };
        assert_eq!(r, __m128i::splat(0));
    }

@@ -2124,7 +2331,7 @@ fn _mm_slli_si128() {
    fn _mm_slli_epi16() {
        let a = i16x8::new(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
-        let r = sse2::_mm_slli_epi16(a, 4);
+        let r = unsafe { sse2::_mm_slli_epi16(a, 4) };
        let e = i16x8::new(
            0xFFF0 as u16 as i16,
            0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0);
@@ -2134,98 +2341,101 @@ fn _mm_slli_epi16() {
    #[simd_test = "sse2"]
    fn _mm_sll_epi16() {
        let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
-        let r = sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
+        let r = unsafe {
+            sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0))
+        };
        assert_eq!(r, i16x8::new(0xFF0, 0, 0, 0, 0, 0, 0, 0));
-        let r = sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0));
+        let r = unsafe {
+            sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0))
+        };
        assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_slli_epi32() {
-        assert_eq!(
-            sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4),
-            i32x4::splat(0xFFFF0));
+        let r = unsafe { sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4) };
+        assert_eq!(r, i32x4::splat(0xFFFF0));
    }

    #[simd_test = "sse2"]
    fn _mm_sll_epi32() {
-        assert_eq!(
-            sse2::_mm_sll_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
-            i32x4::splat(0xFFFF0));
+        let a = i32x4::splat(0xFFFF);
+        let b = i32x4::new(4, 0, 0, 0);
+        let r = unsafe { sse2::_mm_sll_epi32(a, b) };
+        assert_eq!(r, i32x4::splat(0xFFFF0));
    }

    #[simd_test = "sse2"]
    fn _mm_slli_epi64() {
-        assert_eq!(
-            sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4),
-            i64x2::splat(0xFFFFFFFF0));
+        let r = unsafe { sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4) };
+        assert_eq!(r, i64x2::splat(0xFFFFFFFF0));
    }

    #[simd_test = "sse2"]
    fn _mm_sll_epi64() {
-        assert_eq!(
-            sse2::_mm_sll_epi64(
-                i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)),
-            i64x2::splat(0xFFFFFFFF0));
+        let a = i64x2::splat(0xFFFFFFFF);
+        let b = i64x2::new(4, 0);
+        let r = unsafe { sse2::_mm_sll_epi64(a, b) };
+        assert_eq!(r, i64x2::splat(0xFFFFFFFF0));
    }

    #[simd_test = "sse2"]
    fn _mm_srai_epi16() {
-        assert_eq!(
-            sse2::_mm_srai_epi16(i16x8::splat(-1), 1), i16x8::splat(-1));
+        let r = unsafe { sse2::_mm_srai_epi16(i16x8::splat(-1), 1) };
+        assert_eq!(r, i16x8::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_sra_epi16() {
-        assert_eq!(
-            sse2::_mm_sra_epi16(
-                i16x8::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)),
-            i16x8::splat(-1));
+        let a = i16x8::splat(-1);
+        let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0);
+        let r = unsafe { sse2::_mm_sra_epi16(a, b) };
+        assert_eq!(r, i16x8::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_srai_epi32() {
-        assert_eq!(
-            sse2::_mm_srai_epi32(i32x4::splat(-1), 1), i32x4::splat(-1));
+        let r = unsafe { sse2::_mm_srai_epi32(i32x4::splat(-1), 1) };
+        assert_eq!(r, i32x4::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_sra_epi32() {
-        assert_eq!(
-            sse2::_mm_sra_epi32(
-                i32x4::splat(-1), i32x4::new(1, 0, 0, 0)),
-            i32x4::splat(-1));
+        let a = i32x4::splat(-1);
+        let b = i32x4::new(1, 0, 0, 0);
+        let r = unsafe { sse2::_mm_sra_epi32(a, b) };
+        assert_eq!(r, i32x4::splat(-1));
    }

    #[simd_test = "sse2"]
    fn _mm_srli_si128() {
        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_srli_si128(a, 1);
+        let r = unsafe { sse2::_mm_srli_si128(a, 1) };
        let e = __m128i::new(
            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
        assert_eq!(r, e);

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_srli_si128(a, 15);
+        let r = unsafe { sse2::_mm_srli_si128(a, 15) };
        let e = __m128i::new(
            16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq!(r, e);

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_srli_si128(a, 16);
+        let r = unsafe { sse2::_mm_srli_si128(a, 16) };
        assert_eq!(r, __m128i::splat(0));

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_srli_si128(a, -1);
+        let r = unsafe { sse2::_mm_srli_si128(a, -1) };
        assert_eq!(r, __m128i::splat(0));

        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = sse2::_mm_srli_si128(a, -0x80000000);
+        let r = unsafe { sse2::_mm_srli_si128(a, -0x80000000) };
        assert_eq!(r, __m128i::splat(0));
    }

@@ -2233,7 +2443,7 @@ fn _mm_srli_si128() {
    fn _mm_srli_epi16() {
        let a = i16x8::new(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
-        let r = sse2::_mm_srli_epi16(a, 4);
+        let r = unsafe { sse2::_mm_srli_epi16(a, 4) };
        let e = i16x8::new(
            0xFFF as u16 as i16,
            0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0);
@@ -2243,67 +2453,74 @@ fn _mm_srli_epi16() {
    #[simd_test = "sse2"]
    fn _mm_srl_epi16() {
        let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
-        let r = sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
+        let r = unsafe {
+            sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0))
+        };
        assert_eq!(r, i16x8::new(0xF, 0, 0, 0, 0, 0, 0, 0));
-        let r = sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0));
+        let r = unsafe {
+            sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0))
+        };
        assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_srli_epi32() {
-        assert_eq!(
-            sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4),
-            i32x4::splat(0xFFF));
+        let r = unsafe { sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4) };
+        assert_eq!(r, i32x4::splat(0xFFF));
    }

    #[simd_test = "sse2"]
    fn _mm_srl_epi32() {
-        assert_eq!(
-            sse2::_mm_srl_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
-            i32x4::splat(0xFFF));
+        let a = i32x4::splat(0xFFFF);
+        let b = i32x4::new(4, 0, 0, 0);
+        let r = unsafe { sse2::_mm_srl_epi32(a, b) };
+        assert_eq!(r, i32x4::splat(0xFFF));
    }

    #[simd_test = "sse2"]
    fn _mm_srli_epi64() {
-        assert_eq!(
-            sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4),
-            i64x2::splat(0xFFFFFFF));
+        let r = unsafe { sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4) };
+        assert_eq!(r, i64x2::splat(0xFFFFFFF));
    }

    #[simd_test = "sse2"]
    fn _mm_srl_epi64() {
-        assert_eq!(
-            sse2::_mm_srl_epi64(
-                i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)),
-            i64x2::splat(0xFFFFFFF));
+        let a = i64x2::splat(0xFFFFFFFF);
+        let b = i64x2::new(4, 0);
+        let r = unsafe { sse2::_mm_srl_epi64(a, b) };
+        assert_eq!(r, i64x2::splat(0xFFFFFFF));
    }

    #[simd_test = "sse2"]
    fn _mm_and_si128() {
-        assert_eq!(
-            sse2::_mm_and_si128(__m128i::splat(5), __m128i::splat(3)),
-            __m128i::splat(1));
+        let a = __m128i::splat(5);
+        let b = __m128i::splat(3);
+        let r = unsafe { sse2::_mm_and_si128(a, b) };
+        assert_eq!(r, __m128i::splat(1));
    }

    #[simd_test = "sse2"]
    fn _mm_andnot_si128() {
-        assert_eq!(
-            sse2::_mm_andnot_si128(__m128i::splat(5), __m128i::splat(3)),
-            __m128i::splat(2));
+        let a = __m128i::splat(5);
+        let b = __m128i::splat(3);
+        let r = unsafe { sse2::_mm_andnot_si128(a, b) };
+        assert_eq!(r, __m128i::splat(2));
    }

    #[simd_test = "sse2"]
    fn _mm_or_si128() {
-        assert_eq!(
-            sse2::_mm_or_si128(__m128i::splat(5), __m128i::splat(3)),
-            __m128i::splat(7));
+        let a = __m128i::splat(5);
+        let b = __m128i::splat(3);
+        let r = unsafe { sse2::_mm_or_si128(a, b) };
+        assert_eq!(r, __m128i::splat(7));
    }

    #[simd_test = "sse2"]
    fn _mm_xor_si128() {
-        assert_eq!(
-            sse2::_mm_xor_si128(__m128i::splat(5), __m128i::splat(3)),
-            __m128i::splat(6));
+        let a = __m128i::splat(5);
+        let b = __m128i::splat(3);
+        let r = unsafe { sse2::_mm_xor_si128(a, b) };
+        assert_eq!(r, __m128i::splat(6));
    }

    #[simd_test = "sse2"]
@@ -2312,7 +2529,7 @@ fn _mm_cmpeq_epi8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i8x16::new(
            15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let r = sse2::_mm_cmpeq_epi8(a, b);
+        let r = unsafe { sse2::_mm_cmpeq_epi8(a, b) };
        assert_eq!(r, i8x16::new(
            0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
    }
@@ -2321,7 +2538,7 @@ fn _mm_cmpeq_epi8() {
    fn _mm_cmpeq_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
-        let r = sse2::_mm_cmpeq_epi16(a, b);
+        let r = unsafe { sse2::_mm_cmpeq_epi16(a, b) };
        assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
    }

@@ -2329,7 +2546,7 @@ fn _mm_cmpeq_epi16() {
    fn _mm_cmpeq_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(3, 2, 2, 0);
-        let r = sse2::_mm_cmpeq_epi32(a, b);
+        let r = unsafe { sse2::_mm_cmpeq_epi32(a, b) };
        assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
    }

@@ -2337,7 +2554,7 @@ fn _mm_cmpeq_epi32() {
    fn _mm_cmpgt_epi8() {
        let a = i8x16::splat(0).replace(0, 5);
        let b = i8x16::splat(0);
-        let r = sse2::_mm_cmpgt_epi8(a, b);
+        let r = unsafe { sse2::_mm_cmpgt_epi8(a, b) };
        assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
    }

@@ -2345,7 +2562,7 @@ fn _mm_cmpgt_epi8() {
    fn _mm_cmpgt_epi16() {
        let a = i16x8::splat(0).replace(0, 5);
        let b = i16x8::splat(0);
-        let r = sse2::_mm_cmpgt_epi16(a, b);
+        let r = unsafe { sse2::_mm_cmpgt_epi16(a, b) };
        assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
    }

@@ -2353,7 +2570,7 @@ fn _mm_cmpgt_epi16() {
    fn _mm_cmpgt_epi32() {
        let a = i32x4::splat(0).replace(0, 5);
        let b = i32x4::splat(0);
-        let r = sse2::_mm_cmpgt_epi32(a, b);
+        let r = unsafe { sse2::_mm_cmpgt_epi32(a, b) };
        assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

@@ -2361,7 +2578,7 @@ fn _mm_cmpgt_epi32() {
    fn _mm_cmplt_epi8() {
        let a = i8x16::splat(0);
        let b = i8x16::splat(0).replace(0, 5);
-        let r = sse2::_mm_cmplt_epi8(a, b);
+        let r = unsafe { sse2::_mm_cmplt_epi8(a, b) };
        assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
    }

@@ -2369,7 +2586,7 @@ fn _mm_cmplt_epi8() {
    fn _mm_cmplt_epi16() {
        let a = i16x8::splat(0);
        let b = i16x8::splat(0).replace(0, 5);
-        let r = sse2::_mm_cmplt_epi16(a, b);
+        let r = unsafe { sse2::_mm_cmplt_epi16(a, b) };
        assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
    }

@@ -2377,123 +2594,161 @@ fn _mm_cmplt_epi16() {
    fn _mm_cmplt_epi32() {
        let a = i32x4::splat(0);
        let b = i32x4::splat(0).replace(0, 5);
-        let r = sse2::_mm_cmplt_epi32(a, b);
+        let r = unsafe { sse2::_mm_cmplt_epi32(a, b) };
        assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

    #[simd_test = "sse2"]
    fn _mm_cvtepi32_pd() {
-        let a = sse2::_mm_set_epi32(35, 25, 15, 5);
-        let r = sse2::_mm_cvtepi32_pd(a);
+        let a = unsafe { sse2::_mm_set_epi32(35, 25, 15, 5) };
+        let r = unsafe { sse2::_mm_cvtepi32_pd(a) };
        assert_eq!(r, f64x2::new(5.0, 15.0));
    }

    #[simd_test = "sse2"]
    fn _mm_cvtsi32_sd() {
        let a = f64x2::splat(3.5);
-        assert_eq!(sse2::_mm_cvtsi32_sd(a, 5), f64x2::new(5.0, 3.5));
+        let r = unsafe { sse2::_mm_cvtsi32_sd(a, 5) };
+        assert_eq!(r, f64x2::new(5.0, 3.5));
    }

+    #[cfg(target_arch = "x86_64")]
    #[simd_test = "sse2"]
    fn _mm_cvtsi64_sd() {
        let a = f64x2::splat(3.5);
-        assert_eq!(sse2::_mm_cvtsi64_sd(a, 5), f64x2::new(5.0, 3.5));
+        let r = unsafe { sse2::_mm_cvtsi64_sd(a, 5) };
+        assert_eq!(r, f64x2::new(5.0, 3.5));
    }

    #[simd_test = "sse2"]
    fn _mm_cvtepi32_ps() {
        let a = i32x4::new(1, 2, 3, 4);
-        assert_eq!(sse2::_mm_cvtepi32_ps(a), f32x4::new(1.0, 2.0, 3.0, 4.0));
+        let r = unsafe { sse2::_mm_cvtepi32_ps(a) };
+        assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test = "sse2"]
    fn _mm_cvtsi32_si128() {
-        assert_eq!(sse2::_mm_cvtsi32_si128(5), i32x4::new(5, 0, 0, 0));
+        let r = unsafe { sse2::_mm_cvtsi32_si128(5) };
+        assert_eq!(r, i32x4::new(5, 0, 0, 0));
    }

+    #[cfg(target_arch = "x86_64")]
    #[simd_test = "sse2"]
    fn _mm_cvtsi64_si128() {
-        assert_eq!(sse2::_mm_cvtsi64_si128(5), i64x2::new(5, 0));
+        let r = unsafe { sse2::_mm_cvtsi64_si128(5) };
+        assert_eq!(r, i64x2::new(5, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_cvtsi128_si32() {
-        assert_eq!(sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)), 5);
+        let r = unsafe { sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)) };
+        assert_eq!(r, 5);
    }

+    #[cfg(target_arch = "x86_64")]
    #[simd_test = "sse2"]
    fn _mm_cvtsi128_si64() {
-        assert_eq!(sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)), 5);
+        let r = unsafe { sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)) };
+        assert_eq!(r, 5);
    }

    #[simd_test = "sse2"]
    fn _mm_set_epi64x() {
-        assert_eq!(sse2::_mm_set_epi64x(0, 1), i64x2::new(1, 0));
+        let r = unsafe { sse2::_mm_set_epi64x(0, 1) };
+        assert_eq!(r, i64x2::new(1, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_set_epi32() {
-        assert_eq!(sse2::_mm_set_epi32(0, 1, 2, 3), i32x4::new(3, 2, 1, 0));
+        let r = unsafe { sse2::_mm_set_epi32(0, 1, 2, 3) };
+        assert_eq!(r, i32x4::new(3, 2, 1, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_set_epi16() {
-        assert_eq!(
-            sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7),
-            i16x8::new(7, 6, 5, 4, 3, 2, 1, 0));
+        let r = unsafe { sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7) };
+        assert_eq!(r, i16x8::new(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_set_epi8() {
-        assert_eq!(
+        let r = unsafe {
            sse2::_mm_set_epi8(
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-            i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+                0, 1, 2, 3,
+                4, 5, 6, 7,
+                8, 9, 10, 11,
+                12, 13, 14, 15,
+            )
+        };
+        let e = i8x16::new(
+            15, 14, 13, 12,
+            11, 10, 9, 8,
+            7, 6, 5, 4,
+            3, 2, 1, 0,
+        );
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_set1_epi64x() {
-        assert_eq!(sse2::_mm_set1_epi64x(1), i64x2::splat(1));
+        let r = unsafe { sse2::_mm_set1_epi64x(1) };
+        assert_eq!(r, i64x2::splat(1));
    }

    #[simd_test = "sse2"]
    fn _mm_set1_epi32() {
-        assert_eq!(sse2::_mm_set1_epi32(1), i32x4::splat(1));
+        let r = unsafe { sse2::_mm_set1_epi32(1) };
+        assert_eq!(r, i32x4::splat(1));
    }

    #[simd_test = "sse2"]
    fn _mm_set1_epi16() {
-        assert_eq!(sse2::_mm_set1_epi16(1), i16x8::splat(1));
+        let r = unsafe { sse2::_mm_set1_epi16(1) };
+        assert_eq!(r, i16x8::splat(1));
    }

    #[simd_test = "sse2"]
    fn _mm_set1_epi8() {
-        assert_eq!(sse2::_mm_set1_epi8(1), i8x16::splat(1));
+        let r = unsafe { sse2::_mm_set1_epi8(1) };
+        assert_eq!(r, i8x16::splat(1));
    }

    #[simd_test = "sse2"]
    fn _mm_setr_epi32() {
-        assert_eq!(sse2::_mm_setr_epi32(0, 1, 2, 3), i32x4::new(0, 1, 2, 3));
+        let r = unsafe { sse2::_mm_setr_epi32(0, 1, 2, 3) };
+        assert_eq!(r, i32x4::new(0, 1, 2, 3));
    }

    #[simd_test = "sse2"]
    fn _mm_setr_epi16() {
-        assert_eq!(
-            sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7),
-            i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
+        let r = unsafe { sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7) };
+        assert_eq!(r, i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
    }

    #[simd_test = "sse2"]
    fn _mm_setr_epi8() {
-        assert_eq!(
+        let r = unsafe {
            sse2::_mm_setr_epi8(
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-            i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+                0, 1, 2, 3,
+                4, 5, 6, 7,
+                8, 9, 10, 11,
+                12, 13, 14, 15,
+            )
+        };
+        let e = i8x16::new(
+            0, 1, 2, 3,
+            4, 5, 6, 7,
+            8, 9, 10, 11,
+            12, 13, 14, 15,
+        );
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_setzero_si128() {
-        assert_eq!(sse2::_mm_setzero_si128(), __m128i::from(i64x2::splat(0)));
+        let r = unsafe { sse2::_mm_setzero_si128() };
+        assert_eq!(r, __m128i::from(i64x2::splat(0)));
    }

    #[simd_test = "sse2"]
@@ -2505,14 +2760,14 @@ fn _mm_loadl_epi64() {

    #[simd_test = "sse2"]
    fn _mm_load_si128() {
-        let a = sse2::_mm_set_epi64x(5, 6);
+        let a = unsafe { sse2::_mm_set_epi64x(5, 6) };
        let r = unsafe { sse2::_mm_load_si128(&a as *const _ as *const _) };
        assert_eq!(a, i64x2::from(r));
    }

    #[simd_test = "sse2"]
    fn _mm_loadu_si128() {
-        let a = sse2::_mm_set_epi64x(5, 6);
+        let a = unsafe { sse2::_mm_set_epi64x(5, 6) };
        let r = unsafe { sse2::_mm_loadu_si128(&a as *const _ as *const _) };
        assert_eq!(a, i64x2::from(r));
    }
@@ -2561,14 +2816,15 @@ fn _mm_storel_epi64() {
    #[simd_test = "sse2"]
    fn _mm_move_epi64() {
        let a = i64x2::new(5, 6);
-        assert_eq!(sse2::_mm_move_epi64(a), i64x2::new(5, 0));
+        let r = unsafe { sse2::_mm_move_epi64(a) };
+        assert_eq!(r, i64x2::new(5, 0));
    }

    #[simd_test = "sse2"]
    fn _mm_packs_epi16() {
        let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
-        let r = sse2::_mm_packs_epi16(a, b);
+        let r = unsafe { sse2::_mm_packs_epi16(a, b) };
        assert_eq!(r, i8x16::new(
            0x7F, -0x80, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, -0x80, 0x7F));
@@ -2578,7 +2834,7 @@ fn _mm_packs_epi16() {
    fn _mm_packs_epi32() {
        let a = i32x4::new(0x8000, -0x8001, 0, 0);
        let b = i32x4::new(0, 0, -0x8001, 0x8000);
-        let r = sse2::_mm_packs_epi32(a, b);
+        let r = unsafe { sse2::_mm_packs_epi32(a, b) };
        assert_eq!(
            r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
    }
@@ -2587,7 +2843,7 @@ fn _mm_packs_epi32() {
    fn _mm_packus_epi16() {
        let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100);
-        let r = sse2::_mm_packus_epi16(a, b);
+        let r = unsafe { sse2::_mm_packus_epi16(a, b) };
        assert_eq!(r, u8x16::new(
            0xFF, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0xFF));
@@ -2596,13 +2852,15 @@ fn _mm_packus_epi16() {
    #[simd_test = "sse2"]
    fn _mm_extract_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        assert_eq!(sse2::_mm_extract_epi16(a, 5), 5);
+        let r = unsafe { sse2::_mm_extract_epi16(a, 5) };
+        assert_eq!(r, 5);
    }

    #[simd_test = "sse2"]
    fn _mm_insert_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9));
+        let r = unsafe { sse2::_mm_insert_epi16(a, 9, 0) };
+        assert_eq!(r, a.replace(0, 9));
    }

    #[simd_test = "sse2"]
@@ -2610,28 +2868,32 @@ fn _mm_movemask_epi8() {
        let a = i8x16::from(u8x16::new(
            0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
            0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000));
-        assert_eq!(sse2::_mm_movemask_epi8(a), 0b10100100_00100101);
+        let r = unsafe { sse2::_mm_movemask_epi8(a) };
+        assert_eq!(r, 0b10100100_00100101);
    }

    #[simd_test = "sse2"]
    fn _mm_shuffle_epi32() {
        let a = i32x4::new(5, 10, 15, 20);
+        let r = unsafe { sse2::_mm_shuffle_epi32(a, 0b00_01_01_11) };
        let e = i32x4::new(20, 10, 10, 5);
-        assert_eq!(sse2::_mm_shuffle_epi32(a, 0b00_01_01_11), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_shufflehi_epi16() {
        let a = i16x8::new(1, 2, 3, 4, 5, 10, 15, 20);
+        let r = unsafe { sse2::_mm_shufflehi_epi16(a, 0b00_01_01_11) };
        let e = i16x8::new(1, 2, 3, 4, 20, 10, 10, 5);
-        assert_eq!(sse2::_mm_shufflehi_epi16(a, 0b00_01_01_11), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_shufflelo_epi16() {
        let a = i16x8::new(5, 10, 15, 20, 1, 2, 3, 4);
+        let r = unsafe { sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11) };
        let e = i16x8::new(20, 10, 10, 5, 1, 2, 3, 4);
-        assert_eq!(sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
@@ -2640,33 +2902,37 @@ fn _mm_unpackhi_epi8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i8x16::new(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = unsafe { sse2::_mm_unpackhi_epi8(a, b) };
        let e = i8x16::new(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-        assert_eq!(sse2::_mm_unpackhi_epi8(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+        let r = unsafe { sse2::_mm_unpackhi_epi16(a, b) };
        let e = i16x8::new(4, 12, 5, 13, 6, 14, 7, 15);
-        assert_eq!(sse2::_mm_unpackhi_epi16(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
+        let r = unsafe { sse2::_mm_unpackhi_epi32(a, b) };
        let e = i32x4::new(2, 6, 3, 7);
-        assert_eq!(sse2::_mm_unpackhi_epi32(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
+        let r = unsafe { sse2::_mm_unpackhi_epi64(a, b) };
        let e = i64x2::new(1, 3);
-        assert_eq!(sse2::_mm_unpackhi_epi64(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
@@ -2675,131 +2941,147 @@ fn _mm_unpacklo_epi8() {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = i8x16::new(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = unsafe { sse2::_mm_unpacklo_epi8(a, b) };
        let e = i8x16::new(
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-        assert_eq!(sse2::_mm_unpacklo_epi8(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+        let r = unsafe { sse2::_mm_unpacklo_epi16(a, b) };
        let e = i16x8::new(0, 8, 1, 9, 2, 10, 3, 11);
-        assert_eq!(sse2::_mm_unpacklo_epi16(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
+        let r = unsafe { sse2::_mm_unpacklo_epi32(a, b) };
        let e = i32x4::new(0, 4, 1, 5);
-        assert_eq!(sse2::_mm_unpacklo_epi32(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
+        let r = unsafe { sse2::_mm_unpacklo_epi64(a, b) };
        let e = i64x2::new(0, 2);
-        assert_eq!(sse2::_mm_unpacklo_epi64(a, b), e);
+        assert_eq!(r, e);
    }

    #[simd_test = "sse2"]
    fn _mm_add_sd() {
-        assert_eq!(
-            sse2::_mm_add_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(6.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_add_sd(a, b) };
+        assert_eq!(r, f64x2::new(6.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_add_pd() {
-        assert_eq!(
-            sse2::_mm_add_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(6.0, 12.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_add_pd(a, b) };
+        assert_eq!(r, f64x2::new(6.0, 12.0));
    }

    #[simd_test = "sse2"]
    fn _mm_div_sd() {
-        assert_eq!(
-            sse2::_mm_div_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(0.2, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_div_sd(a, b) };
+        assert_eq!(r, f64x2::new(0.2, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_div_pd() {
-        assert_eq!(
-            sse2::_mm_div_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(0.2, 0.2));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_div_pd(a, b) };
+        assert_eq!(r, f64x2::new(0.2, 0.2));
    }

    #[simd_test = "sse2"]
    fn _mm_max_sd() {
-        assert_eq!(
-            sse2::_mm_max_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(5.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_max_sd(a, b) };
+        assert_eq!(r, f64x2::new(5.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_max_pd() {
-        assert_eq!(
-            sse2::_mm_max_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(5.0, 10.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_max_pd(a, b) };
+        assert_eq!(r, f64x2::new(5.0, 10.0));
    }

    #[simd_test = "sse2"]
    fn _mm_min_sd() {
-        assert_eq!(
-            sse2::_mm_min_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(1.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_min_sd(a, b) };
+        assert_eq!(r, f64x2::new(1.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_min_pd() {
-        assert_eq!(
-            sse2::_mm_min_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(1.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_min_pd(a, b) };
+        assert_eq!(r, f64x2::new(1.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_mul_sd() {
-        assert_eq!(
-            sse2::_mm_mul_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(5.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_mul_sd(a, b) };
+        assert_eq!(r, f64x2::new(5.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_mul_pd() {
-        assert_eq!(
-            sse2::_mm_mul_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(5.0, 20.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_mul_pd(a, b) };
+        assert_eq!(r, f64x2::new(5.0, 20.0));
    }

    #[simd_test = "sse2"]
    fn _mm_sqrt_sd() {
-        assert_eq!(
-            sse2::_mm_sqrt_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(5.0f64.sqrt(), 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_sqrt_sd(a, b) };
+        assert_eq!(r, f64x2::new(5.0f64.sqrt(), 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_sqrt_pd() {
-        assert_eq!(
-            sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)),
-            f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt()));
+        let r = unsafe { sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)) };
+        assert_eq!(r, f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test = "sse2"]
    fn _mm_sub_sd() {
-        assert_eq!(
-            sse2::_mm_sub_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(-4.0, 2.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_sub_sd(a, b) };
+        assert_eq!(r, f64x2::new(-4.0, 2.0));
    }

    #[simd_test = "sse2"]
    fn _mm_sub_pd() {
-        assert_eq!(
-            sse2::_mm_sub_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
-            f64x2::new(-4.0, -8.0));
+        let a = f64x2::new(1.0, 2.0);
+        let b = f64x2::new(5.0, 10.0);
+        let r = unsafe { sse2::_mm_sub_pd(a, b) };
+        assert_eq!(r, f64x2::new(-4.0, -8.0));
    }

    #[simd_test = "sse2"]
@@ -2809,8 +3091,9 @@ fn _mm_and_pd() {
        unsafe {
            let a: f64x2 = transmute(i64x2::splat(5));
            let b: f64x2 = transmute(i64x2::splat(3));
+            let r = sse2::_mm_and_pd(a, b);
            let e: f64x2 = transmute(i64x2::splat(1));
-            assert_eq!(sse2::_mm_and_pd(a, b), e);
+            assert_eq!(r, e);
        }
    }

@@ -2821,8 +3104,9 @@ fn _mm_andnot_pd() {
        unsafe {
            let a: f64x2 = transmute(i64x2::splat(5));
            let b: f64x2 = transmute(i64x2::splat(3));
+            let r = sse2::_mm_andnot_pd(a, b);
            let e: f64x2 = transmute(i64x2::splat(2));
-            assert_eq!(sse2::_mm_andnot_pd(a, b), e);
+            assert_eq!(r, e);
        }
    }

@@ -2833,8 +3117,9 @@ fn _mm_or_pd() {
        unsafe {
            let a: f64x2 = transmute(i64x2::splat(5));
            let b: f64x2 = transmute(i64x2::splat(3));
+            let r = sse2::_mm_or_pd(a, b);
            let e: f64x2 = transmute(i64x2::splat(7));
-            assert_eq!(sse2::_mm_or_pd(a, b), e);
+            assert_eq!(r, e);
        }
    }

@@ -2845,8 +3130,9 @@ fn _mm_xor_pd() {
        unsafe {
            let a: f64x2 = transmute(i64x2::splat(5));
            let b: f64x2 = transmute(i64x2::splat(3));
+            let r = sse2::_mm_xor_pd(a, b);
            let e: f64x2 = transmute(i64x2::splat(6));
-            assert_eq!(sse2::_mm_xor_pd(a, b), e);
+            assert_eq!(r, e);
        }
    }

@@ -3147,40 +3433,40 @@ fn _mm_comieq_sd() {
        use std::f64::NAN;

        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_comieq_sd(a, b));
+        assert!(unsafe { sse2::_mm_comieq_sd(a, b) });

        let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_comieq_sd(a, b));
+        assert!(unsafe { !sse2::_mm_comieq_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_comilt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_comilt_sd(a, b));
+        assert!(unsafe { !sse2::_mm_comilt_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_comile_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_comile_sd(a, b));
+        assert!(unsafe { sse2::_mm_comile_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_comigt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_comigt_sd(a, b));
+        assert!(unsafe { !sse2::_mm_comigt_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_comige_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_comige_sd(a, b));
+        assert!(unsafe { sse2::_mm_comige_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_comineq_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_comineq_sd(a, b));
+        assert!(unsafe { !sse2::_mm_comineq_sd(a, b) });
    }

    #[simd_test = "sse2"]
@@ -3188,48 +3474,48 @@ fn _mm_ucomieq_sd() {
        use std::f64::NAN;

        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_ucomieq_sd(a, b));
+        assert!(unsafe { sse2::_mm_ucomieq_sd(a, b) });

        let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(NAN, 3.0));
-        assert!(!sse2::_mm_ucomieq_sd(a, b));
+        assert!(unsafe { !sse2::_mm_ucomieq_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_ucomilt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_ucomilt_sd(a, b));
+        assert!(unsafe { !sse2::_mm_ucomilt_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_ucomile_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_ucomile_sd(a, b));
+        assert!(unsafe { sse2::_mm_ucomile_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_ucomigt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_ucomigt_sd(a, b));
+        assert!(unsafe { !sse2::_mm_ucomigt_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_ucomige_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(sse2::_mm_ucomige_sd(a, b));
+        assert!(unsafe { sse2::_mm_ucomige_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_ucomineq_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
-        assert!(!sse2::_mm_ucomineq_sd(a, b));
+        assert!(unsafe { !sse2::_mm_ucomineq_sd(a, b) });
    }

    #[simd_test = "sse2"]
    fn _mm_movemask_pd() {
-        let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0));
+        let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0)) };
        assert_eq!(r, 0b01);

-        let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0));
+        let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0)) };
        assert_eq!(r, 0b11);
    }
 }
@@ -1,18 +1,18 @@
-use v128::*;
-use x86::__m128i;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v128::*;
+use x86::__m128i;
+
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
-pub fn _mm_blendv_epi8(
+pub unsafe fn _mm_blendv_epi8(
    a: __m128i,
    b: __m128i,
    mask: __m128i,
 ) -> __m128i {
-    unsafe { pblendvb(a, b, mask) }
+    pblendvb(a, b, mask)
 }

 /// Returns the dot product of two f64x2 vectors.
@@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8(
 /// the broadcast mask bit is zero then the return component will be zero.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { dppd(a, b, $imm8) }
-        }
+        ($imm8:expr) => { dppd(a, b, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(dppd))]
+fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 {
+    unsafe { _mm_dp_pd(a, b, 0) }
+}
+
 /// Returns the dot product of two f32x4 vectors.
 ///
 /// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
@@ -42,15 +47,20 @@ macro_rules! call {
 /// the broadcast mask bit is zero then the return component will be zero.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { dpps(a, b, $imm8) }
-        }
+        ($imm8:expr) => { dpps(a, b, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(dpps))]
+fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { _mm_dp_ps(a, b, 0) }
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse41.pblendvb"]
@@ -78,7 +88,7 @@ fn _mm_blendv_epi8() {
            0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
        let e = i8x16::new(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
-        assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
+        assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e);
    }

    #[simd_test = "sse4.1"]
@@ -86,7 +96,7 @@ fn _mm_dp_pd() {
        let a = f64x2::new(2.0, 3.0);
        let b = f64x2::new(1.0, 4.0);
        let e = f64x2::new(14.0, 0.0);
-        assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
+        assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e);
    }

    #[simd_test = "sse4.1"]
@@ -94,6 +104,6 @@ fn _mm_dp_ps() {
        let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
        let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
        let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
-        assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
+        assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e);
    }
 }
@@ -1,3 +1,6 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 use x86::__m128i;

 pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
@@ -19,7 +22,7 @@

 #[inline(always)]
 #[target_feature = "+sse4.2"]
-pub fn _mm_cmpestri(
+pub unsafe fn _mm_cmpestri(
    a: __m128i,
    la: i32,
    b: __m128i,
@@ -27,13 +30,18 @@ pub fn _mm_cmpestri(
    imm8: i8,
 ) -> i32 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { pcmpestri128(a, la, b, lb, $imm8) }
-        }
+        ($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.2"]
+#[cfg_attr(test, assert_instr(pcmpestri))]
+fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+    unsafe { _mm_cmpestri(a, la, b, lb, 0) }
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse42.pcmpestri128"]
@@ -53,8 +61,10 @@ fn _mm_cmpestri() {
        let b = &b"foobar          "[..];
        let va = __m128i::from(u8x16::load(a, 0));
        let vb = __m128i::from(u8x16::load(b, 0));
-        let i = sse42::_mm_cmpestri(
-            va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
+        let i = unsafe {
+            sse42::_mm_cmpestri(
+                va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED)
+        };
        assert_eq!(3, i);
    }
 }
@@ -1,15 +1,15 @@
-use v128::*;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v128::*;
+
 /// Compute the absolute value of packed 8-bit signed integers in `a` and
 /// return the unsigned results.
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pabsb))]
-pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
-    unsafe { pabsb128(a) }
+pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
+    pabsb128(a)
 }

 /// Shuffle bytes from `a` according to the content of `b`.
@@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pshufb))]
-pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { pshufb128(a, b) }
+pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
+    pshufb128(a, b)
 }


@@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
 extern {
    #[link_name = "llvm.x86.ssse3.pabs.b.128"]
    fn pabsb128(a: i8x16) -> u8x16;
-
    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
 }
@@ -62,16 +61,31 @@ mod tests {

    #[simd_test = "ssse3"]
    fn _mm_abs_epi8() {
-        let r = ssse3::_mm_abs_epi8(i8x16::splat(-5));
+        let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) };
        assert_eq!(r, u8x16::splat(5));
    }

    #[simd_test = "ssse3"]
    fn _mm_shuffle_epi8() {
-        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
-        let expected = u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
-        let r = ssse3::_mm_shuffle_epi8(a, b);
+        let a = u8x16::new(
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16,
+        );
+        let b = u8x16::new(
+            4, 128, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = u8x16::new(
+            5, 0, 5, 4,
+            9, 13, 7, 4,
+            13, 6, 6, 11,
+            5, 2, 9, 1,
+        );
+        let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) };
        assert_eq!(r, expected);
    }
 }
@@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcfill))]
-pub fn _blcfill_u32(x: u32) -> u32 {
+pub unsafe fn _blcfill_u32(x: u32) -> u32 {
    x & (x.wrapping_add(1))
 }

@@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcfill_u64(x: u64) -> u64 {
+pub unsafe fn _blcfill_u64(x: u64) -> u64 {
    x & (x.wrapping_add(1))
 }

@@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blci))]
-pub fn _blci_u32(x: u32) -> u32 {
+pub unsafe fn _blci_u32(x: u32) -> u32 {
    x | !(x.wrapping_add(1))
 }

@@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blci))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blci_u64(x: u64) -> u64 {
+pub unsafe fn _blci_u64(x: u64) -> u64 {
    x | !(x.wrapping_add(1))
 }

@@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcic))]
-pub fn _blcic_u32(x: u32) -> u32 {
+pub unsafe fn _blcic_u32(x: u32) -> u32 {
    !x & (x.wrapping_add(1))
 }

@@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcic_u64(x: u64) -> u64 {
+pub unsafe fn _blcic_u64(x: u64) -> u64 {
    !x & (x.wrapping_add(1))
 }

@@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcmsk))]
-pub fn _blcmsk_u32(x: u32) -> u32 {
+pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_add(1))
 }

@@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcmsk_u64(x: u64) -> u64 {
+pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_add(1))
 }

@@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcs))]
-pub fn _blcs_u32(x: u32) -> u32 {
+pub unsafe fn _blcs_u32(x: u32) -> u32 {
    x | (x.wrapping_add(1))
 }

@@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcs))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcs_u64(x: u64) -> u64 {
+pub unsafe fn _blcs_u64(x: u64) -> u64 {
    x | x.wrapping_add(1)
 }

@@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsfill))]
-pub fn _blsfill_u32(x: u32) -> u32 {
+pub unsafe fn _blsfill_u32(x: u32) -> u32 {
    x | (x.wrapping_sub(1))
 }

@@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsfill_u64(x: u64) -> u64 {
+pub unsafe fn _blsfill_u64(x: u64) -> u64 {
    x | (x.wrapping_sub(1))
 }

@@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsic))]
-pub fn _blsic_u32(x: u32) -> u32 {
+pub unsafe fn _blsic_u32(x: u32) -> u32 {
    !x | (x.wrapping_sub(1))
 }

@@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsic_u64(x: u64) -> u64 {
+pub unsafe fn _blsic_u64(x: u64) -> u64 {
    !x | (x.wrapping_sub(1))
 }

@@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(t1mskc))]
-pub fn _t1mskc_u32(x: u32) -> u32 {
+pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
    !x | (x.wrapping_add(1))
 }

@@ -225,7 +225,7 @@ pub fn _t1mskc_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(t1mskc))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _t1mskc_u64(x: u64) -> u64 {
+pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
    !x | (x.wrapping_add(1))
 }

@@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(tzmsk))]
-pub fn _tzmsk_u32(x: u32) -> u32 {
+pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
    !x & (x.wrapping_sub(1))
 }

@@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(tzmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _tzmsk_u64(x: u64) -> u64 {
+pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
    !x & (x.wrapping_sub(1))
 }

@@ -272,122 +272,174 @@ fn _bextr_u64() {

    #[simd_test = "tbm"]
    fn _blcfill_u32() {
-        assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
-        assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u32(0b0101_0111u32) },
+            0b0101_0000u32);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u32(0b1111_1111u32) },
+            0u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcfill_u64() {
-        assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
-        assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u64(0b0101_0111u64) },
+            0b0101_0000u64);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u64(0b1111_1111u64) },
+            0u64);
    }

    #[simd_test = "tbm"]
    fn _blci_u32() {
-        assert_eq!(tbm::_blci_u32(0b0101_0000u32),
-                   0b1111_1111_1111_1111_1111_1111_1111_1110u32);
-        assert_eq!(tbm::_blci_u32(0b1111_1111u32),
-                   0b1111_1111_1111_1111_1111_1110_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blci_u32(0b0101_0000u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1110u32);
+        assert_eq!(
+            unsafe { tbm::_blci_u32(0b1111_1111u32) },
+            0b1111_1111_1111_1111_1111_1110_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blci_u64() {
-        assert_eq!(tbm::_blci_u64(0b0101_0000u64),
-                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
-        assert_eq!(tbm::_blci_u64(0b1111_1111u64),
-                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blci_u64(0b0101_0000u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
+        assert_eq!(
+            unsafe { tbm::_blci_u64(0b1111_1111u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blcic_u32() {
-        assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
-        assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
+        assert_eq!(
+            unsafe { tbm::_blcic_u32(0b0101_0001u32) },
+            0b0000_0010u32);
+        assert_eq!(
+            unsafe { tbm::_blcic_u32(0b1111_1111u32) },
+            0b1_0000_0000u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcic_u64() {
-        assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
-        assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
+        assert_eq!(
+            unsafe { tbm::_blcic_u64(0b0101_0001u64) },
+            0b0000_0010u64);
+        assert_eq!(
+            unsafe { tbm::_blcic_u64(0b1111_1111u64) },
+            0b1_0000_0000u64);
    }

    #[simd_test = "tbm"]
    fn _blcmsk_u32() {
-        assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
-        assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u32(0b0101_0001u32) },
+            0b0000_0011u32);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u32(0b1111_1111u32) },
+            0b1_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcmsk_u64() {
-        assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
-        assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u64(0b0101_0001u64) },
+            0b0000_0011u64);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u64(0b1111_1111u64) },
+            0b1_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blcs_u32() {
-       assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
-       assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
+       assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32);
+       assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcs_u64() {
-       assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
-       assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
+       assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64);
+       assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blsfill_u32() {
-        assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
-        assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u32(0b0101_0100u32) },
+            0b0101_0111u32);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u32(0u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsfill_u64() {
-        assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
-        assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u64(0b0101_0100u64) },
+            0b0101_0111u64);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u64(0u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blsic_u32() {
-        assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
-        assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blsic_u32(0b0101_0100u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1011u32);
+        assert_eq!(
+            unsafe { tbm::_blsic_u32(0u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsic_u64() {
-        assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
-       assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blsic_u64(0b0101_0100u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
+       assert_eq!(
+           unsafe { tbm::_blsic_u64(0u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _t1mskc_u32() {
-       assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
-       assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u32(0b0101_0111u32) },
+           0b1111_1111_1111_1111_1111_1111_1111_1000u32);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u32(0u32) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _t1mksc_u64() {
-       assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
-       assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u64(0b0101_0111u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u64(0u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _tzmsk_u32() {
-        assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
-        assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
+        assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32);
+        assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _tzmsk_u64() {
-        assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
-        assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
+        assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64);
+        assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64);
    }
 }