From d5fd2b09a7c1e29a8bd03e781ab14a6113c81dc6 Mon Sep 17 00:00:00 2001
From: pythoneer <dustin.bensing@googlemail.com>
Date: Sat, 21 Oct 2017 17:46:55 +0200
Subject: [PATCH] sse2 (#131)

* added missing doc _mm_cvtps_pd
added missing doc & test _mm_load_pd
added missing doc & test _mm_store_pd
added _mm_store1_pd
added _mm_store_pd1
added _mm_storer_pd
added _mm_load_pd1
added _mm_loadr_pd
added _mm_loadu_pd

* correct alignments
---
 library/stdarch/src/x86/sse2.rs | 202 +++++++++++++++++++++++++++++++-
 1 file changed, 201 insertions(+), 1 deletion(-)

diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs
index c92f7145a15d..4aeae8f8cb17 100644
--- a/library/stdarch/src/x86/sse2.rs
+++ b/library/stdarch/src/x86/sse2.rs
@@ -1726,6 +1726,9 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 {
     cvtpd2ps(a)
 }
 
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// double-precision (64-bit) floating-point elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(cvtps2pd))]
@@ -1873,7 +1876,9 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
 
 
 
-
+/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
+/// from memory into the returned vector. mem_addr must be aligned on a 16-byte boundary or
+/// a general-protection exception may be generated.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(movaps))]
@@ -1881,6 +1886,9 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
     *(mem_addr as *const f64x2)
 }
 
+/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`
+/// into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
+/// may be generated.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(movaps))]
@@ -1888,6 +1896,35 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) {
     *(mem_addr as *mut f64x2) = a;
 }
 
+/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
+/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [0, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
+/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
+/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [0, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
+/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [1, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
 /// Load a double-precision (64-bit) floating-point element from memory
 /// into both elements of returned vector.
 #[inline(always)]
@@ -1897,6 +1934,41 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 {
     f64x2::new(d, d)
 }
 
+/// Load a double-precision (64-bit) floating-point element from memory
+/// into both elements of returned vector.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> f64x2 {
+    let d = *mem_addr;
+    f64x2::new(d, d)
+}
+
+/// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector
+/// in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movapd))]
+pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> f64x2 {
+    let a = _mm_load_pd(mem_addr);
+    simd_shuffle2(a, a, [1, 0])
+}
+
+/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
+/// from memory into the returned vector. mem_addr does not need to be aligned on any particular
+/// oundary.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movups))]
+pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 {
+    let mut dst = f64x2::splat(mem::uninitialized());
+    ptr::copy_nonoverlapping(
+        mem_addr as *const u8,
+        &mut dst as *mut f64x2 as *mut u8,
+        mem::size_of::<f64x2>());
+    dst
+}
+
 /// Return vector of type __m128d with undefined elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
@@ -2068,6 +2140,7 @@ pub unsafe fn _mm_undefined_pd() -> f64x2 {
 mod tests {
     use std::os::raw::c_void;
     use stdsimd_test::simd_test;
+    use test::black_box;  // Used to inhibit constant-folding.
 
     use v128::*;
     use x86::{__m128i, sse2};
@@ -3587,6 +3660,126 @@ unsafe fn _mm_movemask_pd() {
         assert_eq!(r, 0b11);
     }
 
+    #[simd_test = "sse2"]
+    unsafe fn _mm_load_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        let r = sse2::_mm_load_pd(d);
+        assert_eq!(r, f64x2::new(1.0, 2.0) + f64x2::splat(offset as f64));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        sse2::_mm_store_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 2.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store1_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        sse2::_mm_store1_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store_pd1() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        sse2::_mm_store_pd1(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_storer_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        sse2::_mm_storer_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 2.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadr_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // Align d to 16-byte boundary
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1 as isize);
+            offset += 1;
+        }
+
+        let r = sse2::_mm_loadr_pd(d);
+        assert_eq!(r, f64x2::new(2.0, 1.0) + f64x2::splat(offset as f64));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadu_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // make sure d is not aligned to 16-byte boundary
+        let mut offset = 0;
+        if (d as usize) & 0xf == 0 {
+            offset = 1;
+            d = d.offset(offset as isize);
+        }
+
+        let r = sse2::_mm_loadu_pd(d);
+        assert_eq!(r, f64x2::new(1.0, 2.0) + f64x2::splat(offset as f64));
+    }
+
     #[simd_test = "sse2"]
     unsafe fn _mm_cvtpd_ps() {
         use std::{f64,f32};
@@ -3795,4 +3988,11 @@ unsafe fn _mm_load1_pd() {
         let r = sse2::_mm_load1_pd(&d);
         assert_eq!(r, f64x2::new(d, d));
     }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_load_pd1() {
+        let d = -5.0;
+        let r = sse2::_mm_load_pd1(&d);
+        assert_eq!(r, f64x2::new(d, d));
+    }
 }