Avx (#126)

* avx: _mm256_zextps128_ps256
* avx: _mm256_zextpd128_pd256
* avx: _mm256_set_m128
* avx: _mm256_set_m128d
* avx: _mm256_castpd_ps
* avx: _mm256_castps_pd
* avx: _mm256_castps_si256
* avx: _mm256_castsi256_ps
* avx: _mm256_zextsi128_si256
* avx: _mm256_set_m128i
2026-05-28 20:16:58 +03:00 · 2017-10-17 00:14:09 +02:00
parent 3286bbbab7
commit 19e7d0ed3e
1 changed files with 178 additions and 1 deletions
@@ -7,6 +7,7 @@
 use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8};
 use v128::{f32x4, f64x2, i32x4, i64x2};
 use v256::*;
+use x86::{__m128i, __m256i};

 /// Add packed double-precision (64-bit) floating-point elements
 /// in `a` and `b`.
@@ -1827,6 +1828,34 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> i64x4 {
    i64x4::new(a, a, a, a)
 }

+/// Casts vector of type __m256d to type __m256.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castpd_ps(a: f64x4) -> f32x8 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256d.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castps_pd(a: f32x8) -> f64x4 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256i.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castps_si256(a: f32x8) -> i64x4 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256i to type __m256.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castsi256_ps(a: i64x4) -> f32x8 {
+    mem::transmute(a)
+}
+
 /// Casts vector of type __m256d to type __m256i.
 /// This intrinsic is only used for compilation and does not generate any
 /// instructions, thus it has zero latency.
@@ -1899,6 +1928,37 @@ pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 {
    simd_shuffle4(a, a, [0, 1, 0, 0])
 }

+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse"]
+pub unsafe fn _mm256_zextps128_ps256(a: f32x4) -> f32x8 {
+    use x86::sse::_mm_setzero_ps;
+    simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+pub unsafe fn _mm256_zextsi128_si256(a: i64x2) -> i64x4 {
+    use x86::sse2::_mm_setzero_si128;
+    simd_shuffle4(a, mem::transmute(_mm_setzero_si128()), [0, 1, 2, 3])
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+pub unsafe fn _mm256_zextpd128_pd256(a: f64x2) -> f64x4 {
+    use x86::sse2::_mm_setzero_pd;
+    simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
+}
+
 /// Return vector of type `f32x8` with undefined elements.
 #[inline(always)]
 #[target_feature = "+avx"]
@@ -1920,6 +1980,34 @@ pub unsafe fn _mm256_undefined_si256() -> i64x4 {
    i64x4::splat(mem::uninitialized())
 }

+/// Sets the returned packed __m256 vector from the supplied values: `lo` fills the lower 128 bits, `hi` the upper.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128(hi: f32x4, lo: f32x4) -> f32x8 {
+    simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Sets the returned packed __m256d vector from the supplied values: `lo` fills the lower 128 bits, `hi` the upper.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128d(hi: f64x2, lo: f64x2) -> f64x4 {
+    let hi: f32x4 = mem::transmute(hi);
+    let lo: f32x4 = mem::transmute(lo);
+    mem::transmute(_mm256_set_m128(hi, lo))
+}
+
+/// Sets the returned packed __m256i vector from the supplied values: `lo` fills the lower 128 bits, `hi` the upper.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+    let hi: f32x4 = mem::transmute(hi);
+    let lo: f32x4 = mem::transmute(lo);
+    mem::transmute(_mm256_set_m128(hi, lo))
+}
+
 /// LLVM intrinsics used in the above functions
 #[allow(improper_ctypes)]
 extern "C" {
@@ -2070,7 +2158,7 @@ mod tests {
    use stdsimd_test::simd_test;
    use test::black_box;  // Used to inhibit constant-folding.

-    use v128::{f32x4, f64x2, i32x4, i64x2};
+    use v128::{f32x4, f64x2, i8x16, i32x4, i64x2};
    use v256::*;
    use x86::avx;

@@ -3390,6 +3478,38 @@ unsafe fn _mm256_set1_epi64x() {
        assert_eq!(r, i64x4::splat(1));
    }

+    #[simd_test = "avx"]
+    unsafe fn _mm256_castpd_ps() {
+        let a = f64x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_castpd_ps(a);
+        let e = f32x8::new(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castps_pd() {
+        let a = f32x8::new(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        let r = avx::_mm256_castps_pd(a);
+        let e = f64x4::new(1., 2., 3., 4.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castps_si256() {
+        let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        let r = avx::_mm256_castps_si256(a);
+        let e = i64x4::new(4611686019492741120, 4647714816524288000, 4665729215040061440, 4683743613553737728);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castsi256_ps() {
+        let a = i64x4::new(4611686019492741120, 4647714816524288000, 4665729215040061440, 4683743613553737728);
+        let r = avx::_mm256_castsi256_ps(a);
+        let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq!(r, e);
+    }
+
    #[simd_test = "avx"]
    unsafe fn _mm256_castpd_si256() {
        let a = f64x4::new(1., 2., 3., 4.);
@@ -3424,4 +3544,61 @@ unsafe fn _mm256_castsi256_si128() {
        let r = avx::_mm256_castsi256_si128(a);
        assert_eq!(r, i64x2::new(1, 2));
    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextps128_ps256() {
+        let a = f32x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_zextps128_ps256(a);
+        let e = f32x8::new(1., 2., 3., 4., 0., 0., 0., 0.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextsi128_si256() {
+        let a = i64x2::new(1, 2);
+        let r = avx::_mm256_zextsi128_si256(a);
+        let e = i64x4::new(1, 2, 0, 0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextpd128_pd256() {
+        let a = f64x2::new(1., 2.);
+        let r = avx::_mm256_zextpd128_pd256(a);
+        let e = f64x4::new(1., 2., 0., 0.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128() {
+        let hi = f32x4::new(5., 6., 7., 8.);
+        let lo = f32x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_set_m128(hi, lo);
+        let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128d() {
+        let hi = f64x2::new(3., 4.);
+        let lo = f64x2::new(1., 2.);
+        let r = avx::_mm256_set_m128d(hi, lo);
+        let e = f64x4::new(1., 2., 3., 4.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128i() {
+        let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32);
+        let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16);
+        let r = avx::_mm256_set_m128i(hi, lo);
+        let e = i8x32::new(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32);
+        assert_eq!(r, e);
+    }
 }