Merge branch 'master' into master

2026-05-29 20:46:07 +03:00 · 2017-09-26 19:37:59 -04:00
parent e1dd2ae43c 60fbf44231
commit d5ed881685
27 changed files with 784 additions and 1519 deletions
@@ -19,4 +19,5 @@ debug = true
 opt-level = 3

 [dev-dependencies]
-assert-instr = { path = "assert-instr" }
+stdsimd-test = { path = "stdsimd-test" }
+cupid = "0.3"
@@ -1,926 +0,0 @@
-**TIP**: Use the following command to generate a section in this list for
-Intel intrinsics. Replace `SSE4.2` with the intended type.
-
-```
-rg '^<intrinsic' intel-intrinsics-3.3.15.xml | rg "'SSE4.2'" | rg '^.*name=\x27([^\x27]+)\x27.*$' -r '* [ ] `$1`' >> TODO.md
-```
-
-rg calls the ripgrep tool, which can be installed with `cargo install ripgrep`
-
-sse
---
-* [ ] `_MM_TRANSPOSE4_PS`
-* [ ] `_mm_getcsr`
-* [ ] `_mm_setcsr`
-* [ ] `_MM_GET_EXCEPTION_STATE`
-* [ ] `_MM_SET_EXCEPTION_STATE`
-* [ ] `_MM_GET_EXCEPTION_MASK`
-* [ ] `_MM_SET_EXCEPTION_MASK`
-* [ ] `_MM_GET_ROUNDING_MODE`
-* [ ] `_MM_SET_ROUNDING_MODE`
-* [ ] `_MM_GET_FLUSH_ZERO_MODE`
-* [ ] `_MM_SET_FLUSH_ZERO_MODE`
-* [ ] `_mm_prefetch`
-* [ ] `_mm_sfence`
-* [ ] `_mm_max_pi16`
-* [ ] `_m_pmaxsw`
-* [ ] `_mm_max_pu8`
-* [ ] `_m_pmaxub`
-* [ ] `_mm_min_pi16`
-* [ ] `_m_pminsw`
-* [ ] `_mm_min_pu8`
-* [ ] `_m_pminub`
-* [ ] `_mm_mulhi_pu16`
-* [ ] `_m_pmulhuw`
-* [ ] `_mm_avg_pu8`
-* [ ] `_m_pavgb`
-* [ ] `_mm_avg_pu16`
-* [ ] `_m_pavgw`
-* [ ] `_mm_sad_pu8`
-* [ ] `_m_psadbw`
-* [ ] `_mm_cvtsi32_ss`
-* [ ] `_mm_cvt_si2ss`
-* [ ] `_mm_cvtsi64_ss`
-* [ ] `_mm_cvtpi32_ps`
-* [ ] `_mm_cvt_pi2ps`
-* [ ] `_mm_cvtpi16_ps`
-* [ ] `_mm_cvtpu16_ps`
-* [ ] `_mm_cvtpi8_ps`
-* [ ] `_mm_cvtpu8_ps`
-* [ ] `_mm_cvtpi32x2_ps`
-* [ ] `_mm_stream_pi`
-* [ ] `_mm_maskmove_si64`
-* [ ] `_m_maskmovq`
-* [ ] `_mm_extract_pi16`
-* [ ] `_m_pextrw`
-* [ ] `_mm_insert_pi16`
-* [ ] `_m_pinsrw`
-* [ ] `_mm_movemask_pi8`
-* [ ] `_m_pmovmskb`
-* [ ] `_mm_shuffle_pi16`
-* [ ] `_m_pshufw`
-* [x] `_mm_add_ss`
-* [x] `_mm_add_ps`
-* [x] `_mm_sub_ss`
-* [x] `_mm_sub_ps`
-* [x] `_mm_mul_ss`
-* [x] `_mm_mul_ps`
-* [x] `_mm_div_ss`
-* [x] `_mm_div_ps`
-* [x] `_mm_sqrt_ss`
-* [x] `_mm_sqrt_ps`
-* [x] `_mm_rcp_ss`
-* [x] `_mm_rcp_ps`
-* [x] `_mm_rsqrt_ss`
-* [x] `_mm_rsqrt_ps`
-* [x] `_mm_min_ss`
-* [x] `_mm_min_ps`
-* [x] `_mm_max_ss`
-* [x] `_mm_max_ps`
-* [ ] `_mm_and_ps`
-* [ ] `_mm_andnot_ps`
-* [ ] `_mm_or_ps`
-* [ ] `_mm_xor_ps`
-* [ ] `_mm_cmpeq_ss`
-* [ ] `_mm_cmpeq_ps`
-* [ ] `_mm_cmplt_ss`
-* [ ] `_mm_cmplt_ps`
-* [ ] `_mm_cmple_ss`
-* [ ] `_mm_cmple_ps`
-* [ ] `_mm_cmpgt_ss`
-* [ ] `_mm_cmpgt_ps`
-* [ ] `_mm_cmpge_ss`
-* [ ] `_mm_cmpge_ps`
-* [ ] `_mm_cmpneq_ss`
-* [ ] `_mm_cmpneq_ps`
-* [ ] `_mm_cmpnlt_ss`
-* [ ] `_mm_cmpnlt_ps`
-* [ ] `_mm_cmpnle_ss`
-* [ ] `_mm_cmpnle_ps`
-* [ ] `_mm_cmpngt_ss`
-* [ ] `_mm_cmpngt_ps`
-* [ ] `_mm_cmpnge_ss`
-* [ ] `_mm_cmpnge_ps`
-* [ ] `_mm_cmpord_ss`
-* [ ] `_mm_cmpord_ps`
-* [ ] `_mm_cmpunord_ss`
-* [ ] `_mm_cmpunord_ps`
-* [ ] `_mm_comieq_ss`
-* [ ] `_mm_comilt_ss`
-* [ ] `_mm_comile_ss`
-* [ ] `_mm_comigt_ss`
-* [ ] `_mm_comige_ss`
-* [ ] `_mm_comineq_ss`
-* [ ] `_mm_ucomieq_ss`
-* [ ] `_mm_ucomilt_ss`
-* [ ] `_mm_ucomile_ss`
-* [ ] `_mm_ucomigt_ss`
-* [ ] `_mm_ucomige_ss`
-* [ ] `_mm_ucomineq_ss`
-* [ ] `_mm_cvtss_si32`
-* [ ] `_mm_cvt_ss2si`
-* [ ] `_mm_cvtss_si64`
-* [ ] `_mm_cvtss_f32`
-* [ ] `_mm_cvtps_pi32`
-* [ ] `_mm_cvt_ps2pi`
-* [ ] `_mm_cvttss_si32`
-* [ ] `_mm_cvtt_ss2si`
-* [ ] `_mm_cvttss_si64`
-* [ ] `_mm_cvttps_pi32`
-* [ ] `_mm_cvtt_ps2pi`
-* [ ] `_mm_cvtps_pi16`
-* [ ] `_mm_cvtps_pi8`
-* [ ] `_mm_set_ss`
-* [ ] `_mm_set1_ps`
-* [ ] `_mm_set_ps1`
-* [ ] `_mm_set_ps`
-* [ ] `_mm_setr_ps`
-* [ ] `_mm_setzero_ps`
-* [ ] `_mm_loadh_pi`
-* [ ] `_mm_loadl_pi`
-* [ ] `_mm_load_ss`
-* [ ] `_mm_load1_ps`
-* [ ] `_mm_load_ps1`
-* [ ] `_mm_load_ps`
-* [ ] `_mm_loadu_ps`
-* [ ] `_mm_loadr_ps`
-* [ ] `_mm_stream_ps`
-* [ ] `_mm_storeh_pi`
-* [ ] `_mm_storel_pi`
-* [ ] `_mm_store_ss`
-* [ ] `_mm_store1_ps`
-* [ ] `_mm_store_ps1`
-* [ ] `_mm_store_ps`
-* [ ] `_mm_storeu_ps`
-* [ ] `_mm_storer_ps`
-* [ ] `_mm_move_ss`
-* [ ] `_mm_shuffle_ps`
-* [x] `_mm_unpackhi_ps`
-* [ ] `_mm_unpacklo_ps`
-* [ ] `_mm_movehl_ps`
-* [ ] `_mm_movelh_ps`
-* [x] `_mm_movemask_ps`
-* [ ] `_mm_undefined_ps`
-
-
-sse2
----
-* [x] `_mm_pause`
-* [x] `_mm_clflush`
-* [x] `_mm_lfence`
-* [x] `_mm_mfence`
-* [x] `_mm_add_epi8`
-* [x] `_mm_add_epi16`
-* [x] `_mm_add_epi32`
-* [ ] `_mm_add_si64`
-* [x] `_mm_add_epi64`
-* [x] `_mm_adds_epi8`
-* [x] `_mm_adds_epi16`
-* [x] `_mm_adds_epu8`
-* [x] `_mm_adds_epu16`
-* [x] `_mm_avg_epu8`
-* [x] `_mm_avg_epu16`
-* [x] `_mm_madd_epi16`
-* [x] `_mm_max_epi16`
-* [x] `_mm_max_epu8`
-* [x] `_mm_min_epi16`
-* [x] `_mm_min_epu8`
-* [x] `_mm_mulhi_epi16`
-* [x] `_mm_mulhi_epu16`
-* [x] `_mm_mullo_epi16`
-* [ ] `_mm_mul_su32`
-* [x] `_mm_mul_epu32`
-* [x] `_mm_sad_epu8`
-* [x] `_mm_sub_epi8`
-* [x] `_mm_sub_epi16`
-* [x] `_mm_sub_epi32`
-* [ ] `_mm_sub_si64`
-* [x] `_mm_sub_epi64`
-* [x] `_mm_subs_epi8`
-* [x] `_mm_subs_epi16`
-* [x] `_mm_subs_epu8`
-* [x] `_mm_subs_epu16`
-* [x] `_mm_slli_si128`
-* [x] `_mm_bslli_si128`
-* [x] `_mm_bsrli_si128`
-* [x] `_mm_slli_epi16`
-* [x] `_mm_sll_epi16`
-* [x] `_mm_slli_epi32`
-* [x] `_mm_sll_epi32`
-* [x] `_mm_slli_epi64`
-* [x] `_mm_sll_epi64`
-* [x] `_mm_srai_epi16`
-* [x] `_mm_sra_epi16`
-* [x] `_mm_srai_epi32`
-* [x] `_mm_sra_epi32`
-* [x] `_mm_srli_si128`
-* [x] `_mm_srli_epi16`
-* [x] `_mm_srl_epi16`
-* [x] `_mm_srli_epi32`
-* [x] `_mm_srl_epi32`
-* [x] `_mm_srli_epi64`
-* [x] `_mm_srl_epi64`
-* [x] `_mm_and_si128`
-* [x] `_mm_andnot_si128`
-* [x] `_mm_or_si128`
-* [x] `_mm_xor_si128`
-* [x] `_mm_cmpeq_epi8`
-* [x] `_mm_cmpeq_epi16`
-* [x] `_mm_cmpeq_epi32`
-* [x] `_mm_cmpgt_epi8`
-* [x] `_mm_cmpgt_epi16`
-* [x] `_mm_cmpgt_epi32`
-* [x] `_mm_cmplt_epi8`
-* [x] `_mm_cmplt_epi16`
-* [x] `_mm_cmplt_epi32`
-* [x] `_mm_cvtepi32_pd`
-* [x] `_mm_cvtsi32_sd`
-* [x] `_mm_cvtsi64_sd`
-* [x] `_mm_cvtsi64x_sd`
-* [x] `_mm_cvtepi32_ps`
-* [ ] `_mm_cvtpi32_pd`
-* [x] `_mm_cvtsi32_si128`
-* [x] `_mm_cvtsi64_si128`
-* [x] `_mm_cvtsi64x_si128`
-* [x] `_mm_cvtsi128_si32`
-* [x] `_mm_cvtsi128_si64`
-* [x] `_mm_cvtsi128_si64x`
-* [ ] `_mm_set_epi64`
-* [x] `_mm_set_epi64x`
-* [x] `_mm_set_epi32`
-* [x] `_mm_set_epi16`
-* [x] `_mm_set_epi8`
-* [ ] `_mm_set1_epi64`
-* [x] `_mm_set1_epi64x`
-* [x] `_mm_set1_epi32`
-* [x] `_mm_set1_epi16`
-* [x] `_mm_set1_epi8`
-* [ ] `_mm_setr_epi64`
-* [x] `_mm_setr_epi32`
-* [x] `_mm_setr_epi16`
-* [x] `_mm_setr_epi8`
-* [x] `_mm_setzero_si128`
-* [x] `_mm_loadl_epi64`
-* [x] `_mm_load_si128`
-* [x] `_mm_loadu_si128`
-* [x] `_mm_maskmoveu_si128`
-* [x] `_mm_store_si128`
-* [x] `_mm_storeu_si128`
-* [x] `_mm_storel_epi64`
-* [ ] `_mm_stream_si128`
-* [ ] `_mm_stream_si32`
-* [ ] `_mm_stream_si64`
-* [ ] `_mm_movepi64_pi64`
-* [ ] `_mm_movpi64_epi64`
-* [x] `_mm_move_epi64`
-* [x] `_mm_packs_epi16`
-* [x] `_mm_packs_epi32`
-* [x] `_mm_packus_epi16`
-* [x] `_mm_extract_epi16`
-* [x] `_mm_insert_epi16`
-* [x] `_mm_movemask_epi8`
-* [x] `_mm_shuffle_epi32`
-* [x] `_mm_shufflehi_epi16`
-* [x] `_mm_shufflelo_epi16`
-* [x] `_mm_unpackhi_epi8`
-* [x] `_mm_unpackhi_epi16`
-* [x] `_mm_unpackhi_epi32`
-* [x] `_mm_unpackhi_epi64`
-* [x] `_mm_unpacklo_epi8`
-* [x] `_mm_unpacklo_epi16`
-* [x] `_mm_unpacklo_epi32`
-* [x] `_mm_unpacklo_epi64`
-* [x] `_mm_add_sd`
-* [x] `_mm_add_pd`
-* [x] `_mm_div_sd`
-* [x] `_mm_div_pd`
-* [x] `_mm_max_sd`
-* [x] `_mm_max_pd`
-* [x] `_mm_min_sd`
-* [x] `_mm_min_pd`
-* [x] `_mm_mul_sd`
-* [x] `_mm_mul_pd`
-* [x] `_mm_sqrt_sd`
-* [x] `_mm_sqrt_pd`
-* [x] `_mm_sub_sd`
-* [x] `_mm_sub_pd`
-* [x] `_mm_and_pd`
-* [x] `_mm_andnot_pd`
-* [x] `_mm_or_pd`
-* [x] `_mm_xor_pd`
-* [x] `_mm_cmpeq_sd`
-* [x] `_mm_cmplt_sd`
-* [x] `_mm_cmple_sd`
-* [x] `_mm_cmpgt_sd`
-* [x] `_mm_cmpge_sd`
-* [x] `_mm_cmpord_sd`
-* [x] `_mm_cmpunord_sd`
-* [x] `_mm_cmpneq_sd`
-* [x] `_mm_cmpnlt_sd`
-* [x] `_mm_cmpnle_sd`
-* [x] `_mm_cmpngt_sd`
-* [x] `_mm_cmpnge_sd`
-* [x] `_mm_cmpeq_pd`
-* [x] `_mm_cmplt_pd`
-* [x] `_mm_cmple_pd`
-* [x] `_mm_cmpgt_pd`
-* [x] `_mm_cmpge_pd`
-* [x] `_mm_cmpord_pd`
-* [x] `_mm_cmpunord_pd`
-* [x] `_mm_cmpneq_pd`
-* [x] `_mm_cmpnlt_pd`
-* [x] `_mm_cmpnle_pd`
-* [x] `_mm_cmpngt_pd`
-* [x] `_mm_cmpnge_pd`
-* [x] `_mm_comieq_sd`
-* [x] `_mm_comilt_sd`
-* [x] `_mm_comile_sd`
-* [x] `_mm_comigt_sd`
-* [x] `_mm_comige_sd`
-* [x] `_mm_comineq_sd`
-* [x] `_mm_ucomieq_sd`
-* [x] `_mm_ucomilt_sd`
-* [x] `_mm_ucomile_sd`
-* [x] `_mm_ucomigt_sd`
-* [x] `_mm_ucomige_sd`
-* [x] `_mm_ucomineq_sd`
-* [ ] `_mm_cvtpd_ps`
-* [ ] `_mm_cvtps_pd`
-* [ ] `_mm_cvtpd_epi32`
-* [ ] `_mm_cvtsd_si32`
-* [ ] `_mm_cvtsd_si64`
-* [ ] `_mm_cvtsd_si64x`
-* [ ] `_mm_cvtsd_ss`
-* [ ] `_mm_cvtsd_f64`
-* [ ] `_mm_cvtss_sd`
-* [ ] `_mm_cvttpd_epi32`
-* [ ] `_mm_cvttsd_si32`
-* [ ] `_mm_cvttsd_si64`
-* [ ] `_mm_cvttsd_si64x`
-* [ ] `_mm_cvtps_epi32`
-* [ ] `_mm_cvttps_epi32`
-* [ ] `_mm_cvtpd_pi32`
-* [ ] `_mm_cvttpd_pi32`
-* [ ] `_mm_set_sd`
-* [ ] `_mm_set1_pd`
-* [ ] `_mm_set_pd1`
-* [ ] `_mm_set_pd`
-* [ ] `_mm_setr_pd`
-* [ ] `_mm_setzero_pd`
-* [ ] `_mm_load_pd`
-* [ ] `_mm_load1_pd`
-* [ ] `_mm_load_pd1`
-* [ ] `_mm_loadr_pd`
-* [ ] `_mm_loadu_pd`
-* [ ] `_mm_load_sd`
-* [ ] `_mm_loadh_pd`
-* [ ] `_mm_loadl_pd`
-* [ ] `_mm_stream_pd`
-* [ ] `_mm_store_sd`
-* [ ] `_mm_store1_pd`
-* [ ] `_mm_store_pd1`
-* [ ] `_mm_store_pd`
-* [ ] `_mm_storeu_pd`
-* [ ] `_mm_storer_pd`
-* [ ] `_mm_storeh_pd`
-* [ ] `_mm_storel_pd`
-* [ ] `_mm_unpackhi_pd`
-* [ ] `_mm_unpacklo_pd`
-* [x] `_mm_movemask_pd`
-* [ ] `_mm_shuffle_pd`
-* [ ] `_mm_move_sd`
-* [ ] `_mm_castpd_ps`
-* [ ] `_mm_castpd_si128`
-* [ ] `_mm_castps_pd`
-* [ ] `_mm_castps_si128`
-* [ ] `_mm_castsi128_pd`
-* [ ] `_mm_castsi128_ps`
-* [ ] `_mm_undefined_pd`
-* [ ] `_mm_undefined_si128`
-
-
-sse3
----
-* [ ] `_mm_addsub_ps`
-* [ ] `_mm_addsub_pd`
-* [ ] `_mm_hadd_pd`
-* [ ] `_mm_hadd_ps`
-* [ ] `_mm_hsub_pd`
-* [ ] `_mm_hsub_ps`
-* [ ] `_mm_lddqu_si128`
-* [ ] `_mm_movedup_pd`
-* [ ] `_mm_loaddup_pd`
-* [ ] `_mm_movehdup_ps`
-* [ ] `_mm_moveldup_ps`
-
-
-ssse3
-----
-* [ ] `_mm_abs_pi8`
-* [x] `_mm_abs_epi8`
-* [ ] `_mm_abs_pi16`
-* [ ] `_mm_abs_epi16`
-* [ ] `_mm_abs_pi32`
-* [ ] `_mm_abs_epi32`
-* [x] `_mm_shuffle_epi8`
-* [ ] `_mm_shuffle_pi8`
-* [ ] `_mm_alignr_epi8`
-* [ ] `_mm_alignr_pi8`
-* [ ] `_mm_hadd_epi16`
-* [ ] `_mm_hadds_epi16`
-* [ ] `_mm_hadd_epi32`
-* [ ] `_mm_hadd_pi16`
-* [ ] `_mm_hadd_pi32`
-* [ ] `_mm_hadds_pi16`
-* [ ] `_mm_hsub_epi16`
-* [ ] `_mm_hsubs_epi16`
-* [ ] `_mm_hsub_epi32`
-* [ ] `_mm_hsub_pi16`
-* [ ] `_mm_hsub_pi32`
-* [ ] `_mm_hsubs_pi16`
-* [ ] `_mm_maddubs_epi16`
-* [ ] `_mm_maddubs_pi16`
-* [ ] `_mm_mulhrs_epi16`
-* [ ] `_mm_mulhrs_pi16`
-* [ ] `_mm_sign_epi8`
-* [ ] `_mm_sign_epi16`
-* [ ] `_mm_sign_epi32`
-* [ ] `_mm_sign_pi8`
-* [ ] `_mm_sign_pi16`
-* [ ] `_mm_sign_pi32`
-
-
-sse4.1
------
-* [ ] `_mm_blend_pd`
-* [ ] `_mm_blend_ps`
-* [ ] `_mm_blendv_pd`
-* [ ] `_mm_blendv_ps`
-* [x] `_mm_blendv_epi8`
-* [ ] `_mm_blend_epi16`
-* [x] `_mm_dp_pd`
-* [x] `_mm_dp_ps`
-* [ ] `_mm_extract_ps`
-* [ ] `_mm_extract_epi8`
-* [ ] `_mm_extract_epi32`
-* [ ] `_mm_extract_epi64`
-* [ ] `_mm_insert_ps`
-* [ ] `_mm_insert_epi8`
-* [ ] `_mm_insert_epi32`
-* [ ] `_mm_insert_epi64`
-* [ ] `_mm_max_epi8`
-* [ ] `_mm_max_epi32`
-* [ ] `_mm_max_epu32`
-* [ ] `_mm_max_epu16`
-* [ ] `_mm_min_epi8`
-* [ ] `_mm_min_epi32`
-* [ ] `_mm_min_epu32`
-* [ ] `_mm_min_epu16`
-* [ ] `_mm_packus_epi32`
-* [ ] `_mm_cmpeq_epi64`
-* [ ] `_mm_cvtepi8_epi16`
-* [ ] `_mm_cvtepi8_epi32`
-* [ ] `_mm_cvtepi8_epi64`
-* [ ] `_mm_cvtepi16_epi32`
-* [ ] `_mm_cvtepi16_epi64`
-* [ ] `_mm_cvtepi32_epi64`
-* [ ] `_mm_cvtepu8_epi16`
-* [ ] `_mm_cvtepu8_epi32`
-* [ ] `_mm_cvtepu8_epi64`
-* [ ] `_mm_cvtepu16_epi32`
-* [ ] `_mm_cvtepu16_epi64`
-* [ ] `_mm_cvtepu32_epi64`
-* [ ] `_mm_mul_epi32`
-* [ ] `_mm_mullo_epi32`
-* [ ] `_mm_testz_si128`
-* [ ] `_mm_testc_si128`
-* [ ] `_mm_testnzc_si128`
-* [ ] `_mm_test_all_zeros`
-* [ ] `_mm_test_mix_ones_zeros`
-* [ ] `_mm_test_all_ones`
-* [ ] `_mm_round_pd`
-* [ ] `_mm_floor_pd`
-* [ ] `_mm_ceil_pd`
-* [ ] `_mm_round_ps`
-* [ ] `_mm_floor_ps`
-* [ ] `_mm_ceil_ps`
-* [ ] `_mm_round_sd`
-* [ ] `_mm_floor_sd`
-* [ ] `_mm_ceil_sd`
-* [ ] `_mm_round_ss`
-* [ ] `_mm_floor_ss`
-* [ ] `_mm_ceil_ss`
-* [ ] `_mm_minpos_epu16`
-* [ ] `_mm_mpsadbw_epu8`
-* [ ] `_mm_stream_load_si128`
-
-
-sse4.2
------
-* [ ] `_mm_cmpistrm`
-* [ ] `_mm_cmpistri`
-* [ ] `_mm_cmpistrz`
-* [ ] `_mm_cmpistrc`
-* [ ] `_mm_cmpistrs`
-* [ ] `_mm_cmpistro`
-* [ ] `_mm_cmpistra`
-* [ ] `_mm_cmpestrm`
-* [ ] `_mm_cmpestri`
-* [ ] `_mm_cmpestrz`
-* [ ] `_mm_cmpestrc`
-* [ ] `_mm_cmpestrs`
-* [ ] `_mm_cmpestro`
-* [ ] `_mm_cmpestra`
-* [ ] `_mm_cmpgt_epi64`
-* [ ] `_mm_crc32_u8`
-* [ ] `_mm_crc32_u16`
-* [ ] `_mm_crc32_u32`
-* [ ] `_mm_crc32_u64`
-
-
-avx
---
-* [x] `_mm256_add_pd`
-* [x] `_mm256_add_ps`
-* [x] `_mm256_addsub_pd`
-* [ ] `_mm256_addsub_ps`
-* [ ] `_mm256_and_pd`
-* [ ] `_mm256_and_ps`
-* [ ] `_mm256_andnot_pd`
-* [ ] `_mm256_andnot_ps`
-* [ ] `_mm256_blend_pd`
-* [ ] `_mm256_blend_ps`
-* [ ] `_mm256_blendv_pd`
-* [ ] `_mm256_blendv_ps`
-* [ ] `_mm256_div_pd`
-* [ ] `_mm256_div_ps`
-* [ ] `_mm256_dp_ps`
-* [ ] `_mm256_hadd_pd`
-* [ ] `_mm256_hadd_ps`
-* [ ] `_mm256_hsub_pd`
-* [ ] `_mm256_hsub_ps`
-* [ ] `_mm256_max_pd`
-* [ ] `_mm256_max_ps`
-* [ ] `_mm256_min_pd`
-* [ ] `_mm256_min_ps`
-* [ ] `_mm256_mul_pd`
-* [ ] `_mm256_mul_ps`
-* [ ] `_mm256_or_pd`
-* [ ] `_mm256_or_ps`
-* [ ] `_mm256_shuffle_pd`
-* [ ] `_mm256_shuffle_ps`
-* [ ] `_mm256_sub_pd`
-* [ ] `_mm256_sub_ps`
-* [ ] `_mm256_xor_pd`
-* [ ] `_mm256_xor_ps`
-* [ ] `_mm_cmp_pd`
-* [ ] `_mm256_cmp_pd`
-* [ ] `_mm_cmp_ps`
-* [ ] `_mm256_cmp_ps`
-* [ ] `_mm_cmp_sd`
-* [ ] `_mm_cmp_ss`
-* [ ] `_mm256_cvtepi32_pd`
-* [ ] `_mm256_cvtepi32_ps`
-* [ ] `_mm256_cvtpd_ps`
-* [ ] `_mm256_cvtps_epi32`
-* [ ] `_mm256_cvtps_pd`
-* [ ] `_mm256_cvttpd_epi32`
-* [ ] `_mm256_cvtpd_epi32`
-* [ ] `_mm256_cvttps_epi32`
-* [ ] `_mm256_extractf128_ps`
-* [ ] `_mm256_extractf128_pd`
-* [ ] `_mm256_extractf128_si256`
-* [ ] `_mm256_extract_epi8`
-* [ ] `_mm256_extract_epi16`
-* [ ] `_mm256_extract_epi32`
-* [ ] `_mm256_extract_epi64`
-* [ ] `_mm256_zeroall`
-* [ ] `_mm256_zeroupper`
-* [ ] `_mm256_permutevar_ps`
-* [ ] `_mm_permutevar_ps`
-* [ ] `_mm256_permute_ps`
-* [ ] `_mm_permute_ps`
-* [ ] `_mm256_permutevar_pd`
-* [ ] `_mm_permutevar_pd`
-* [ ] `_mm256_permute_pd`
-* [ ] `_mm_permute_pd`
-* [ ] `_mm256_permute2f128_ps`
-* [ ] `_mm256_permute2f128_pd`
-* [ ] `_mm256_permute2f128_si256`
-* [ ] `_mm256_broadcast_ss`
-* [ ] `_mm_broadcast_ss`
-* [ ] `_mm256_broadcast_sd`
-* [ ] `_mm256_broadcast_ps`
-* [ ] `_mm256_broadcast_pd`
-* [ ] `_mm256_insertf128_ps`
-* [ ] `_mm256_insertf128_pd`
-* [ ] `_mm256_insertf128_si256`
-* [ ] `_mm256_insert_epi8`
-* [ ] `_mm256_insert_epi16`
-* [ ] `_mm256_insert_epi32`
-* [ ] `_mm256_insert_epi64`
-* [ ] `_mm256_load_pd`
-* [ ] `_mm256_store_pd`
-* [ ] `_mm256_load_ps`
-* [ ] `_mm256_store_ps`
-* [ ] `_mm256_loadu_pd`
-* [ ] `_mm256_storeu_pd`
-* [ ] `_mm256_loadu_ps`
-* [ ] `_mm256_storeu_ps`
-* [ ] `_mm256_load_si256`
-* [ ] `_mm256_store_si256`
-* [ ] `_mm256_loadu_si256`
-* [ ] `_mm256_storeu_si256`
-* [ ] `_mm256_maskload_pd`
-* [ ] `_mm256_maskstore_pd`
-* [ ] `_mm_maskload_pd`
-* [ ] `_mm_maskstore_pd`
-* [ ] `_mm256_maskload_ps`
-* [ ] `_mm256_maskstore_ps`
-* [ ] `_mm_maskload_ps`
-* [ ] `_mm_maskstore_ps`
-* [ ] `_mm256_movehdup_ps`
-* [ ] `_mm256_moveldup_ps`
-* [ ] `_mm256_movedup_pd`
-* [ ] `_mm256_lddqu_si256`
-* [ ] `_mm256_stream_si256`
-* [ ] `_mm256_stream_pd`
-* [ ] `_mm256_stream_ps`
-* [ ] `_mm256_rcp_ps`
-* [ ] `_mm256_rsqrt_ps`
-* [ ] `_mm256_sqrt_pd`
-* [ ] `_mm256_sqrt_ps`
-* [ ] `_mm256_round_pd`
-* [ ] `_mm256_round_ps`
-* [ ] `_mm256_unpackhi_pd`
-* [ ] `_mm256_unpackhi_ps`
-* [ ] `_mm256_unpacklo_pd`
-* [ ] `_mm256_unpacklo_ps`
-* [ ] `_mm256_testz_si256`
-* [ ] `_mm256_testc_si256`
-* [ ] `_mm256_testnzc_si256`
-* [ ] `_mm256_testz_pd`
-* [ ] `_mm256_testc_pd`
-* [ ] `_mm256_testnzc_pd`
-* [ ] `_mm_testz_pd`
-* [ ] `_mm_testc_pd`
-* [ ] `_mm_testnzc_pd`
-* [ ] `_mm256_testz_ps`
-* [ ] `_mm256_testc_ps`
-* [ ] `_mm256_testnzc_ps`
-* [ ] `_mm_testz_ps`
-* [ ] `_mm_testc_ps`
-* [ ] `_mm_testnzc_ps`
-* [ ] `_mm256_movemask_pd`
-* [ ] `_mm256_movemask_ps`
-* [ ] `_mm256_setzero_pd`
-* [ ] `_mm256_setzero_ps`
-* [ ] `_mm256_setzero_si256`
-* [ ] `_mm256_set_pd`
-* [ ] `_mm256_set_ps`
-* [ ] `_mm256_set_epi8`
-* [ ] `_mm256_set_epi16`
-* [ ] `_mm256_set_epi32`
-* [ ] `_mm256_set_epi64x`
-* [ ] `_mm256_setr_pd`
-* [ ] `_mm256_setr_ps`
-* [ ] `_mm256_setr_epi8`
-* [ ] `_mm256_setr_epi16`
-* [ ] `_mm256_setr_epi32`
-* [ ] `_mm256_setr_epi64x`
-* [ ] `_mm256_set1_pd`
-* [ ] `_mm256_set1_ps`
-* [ ] `_mm256_set1_epi8`
-* [ ] `_mm256_set1_epi16`
-* [ ] `_mm256_set1_epi32`
-* [ ] `_mm256_set1_epi64x`
-* [ ] `_mm256_castpd_ps`
-* [ ] `_mm256_castps_pd`
-* [ ] `_mm256_castps_si256`
-* [ ] `_mm256_castpd_si256`
-* [ ] `_mm256_castsi256_ps`
-* [ ] `_mm256_castsi256_pd`
-* [ ] `_mm256_castps256_ps128`
-* [ ] `_mm256_castpd256_pd128`
-* [ ] `_mm256_castsi256_si128`
-* [ ] `_mm256_castps128_ps256`
-* [ ] `_mm256_castpd128_pd256`
-* [ ] `_mm256_castsi128_si256`
-* [ ] `_mm256_zextps128_ps256`
-* [ ] `_mm256_zextpd128_pd256`
-* [ ] `_mm256_zextsi128_si256`
-* [ ] `_mm256_floor_ps`
-* [ ] `_mm256_ceil_ps`
-* [ ] `_mm256_floor_pd`
-* [ ] `_mm256_ceil_pd`
-* [ ] `_mm256_undefined_ps`
-* [ ] `_mm256_undefined_pd`
-* [ ] `_mm256_undefined_si256`
-* [ ] `_mm256_set_m128`
-* [ ] `_mm256_set_m128d`
-* [ ] `_mm256_set_m128i`
-* [ ] `_mm256_setr_m128`
-* [ ] `_mm256_setr_m128d`
-* [ ] `_mm256_setr_m128i`
-* [ ] `_mm256_loadu2_m128`
-* [ ] `_mm256_loadu2_m128d`
-* [ ] `_mm256_loadu2_m128i`
-* [ ] `_mm256_storeu2_m128`
-* [ ] `_mm256_storeu2_m128d`
-* [ ] `_mm256_storeu2_m128i`
-
-
-
-avx2
----
-* [x] `_mm256_abs_epi8`
-* [x] `_mm256_abs_epi16`
-* [x] `_mm256_abs_epi32`
-* [x] `_mm256_add_epi8`
-* [x] `_mm256_add_epi16`
-* [x] `_mm256_add_epi32`
-* [x] `_mm256_add_epi64`
-* [x] `_mm256_adds_epi8`
-* [x] `_mm256_adds_epi16`
-* [x] `_mm256_adds_epu8`
-* [x] `_mm256_adds_epu16`
-* [ ] `_mm256_alignr_epi8`
-* [x] `_mm256_and_si256`
-* [x] `_mm256_andnot_si256`
-* [x] `_mm256_avg_epu8`
-* [x] `_mm256_avg_epu16`
-* [ ] `_mm256_blend_epi16`
-* [ ] `_mm_blend_epi32`
-* [ ] `_mm256_blend_epi32`
-* [x] `_mm256_blendv_epi8`
-* [ ] `_mm_broadcastb_epi8`
-* [ ] `_mm256_broadcastb_epi8`
-* [ ] `_mm_broadcastd_epi32`
-* [ ] `_mm256_broadcastd_epi32`
-* [ ] `_mm_broadcastq_epi64`
-* [ ] `_mm256_broadcastq_epi64`
-* [ ] `_mm_broadcastsd_pd`
-* [ ] `_mm256_broadcastsd_pd`
-* [ ] `_mm_broadcastsi128_si256`
-* [ ] `_mm256_broadcastsi128_si256`
-* [ ] `_mm_broadcastss_ps`
-* [ ] `_mm256_broadcastss_ps`
-* [ ] `_mm_broadcastw_epi16`
-* [ ] `_mm256_broadcastw_epi16`
-* [x] `_mm256_cmpeq_epi8`
-* [x] `_mm256_cmpeq_epi16`
-* [x] `_mm256_cmpeq_epi32`
-* [x] `_mm256_cmpeq_epi64`
-* [x] `_mm256_cmpgt_epi8`
-* [x] `_mm256_cmpgt_epi16`
-* [x] `_mm256_cmpgt_epi32`
-* [x] `_mm256_cmpgt_epi64`
-* [ ] `_mm256_cvtepi16_epi32`
-* [ ] `_mm256_cvtepi16_epi64`
-* [ ] `_mm256_cvtepi32_epi64`
-* [ ] `_mm256_cvtepi8_epi16`
-* [ ] `_mm256_cvtepi8_epi32`
-* [ ] `_mm256_cvtepi8_epi64`
-* [ ] `_mm256_cvtepu16_epi32`
-* [ ] `_mm256_cvtepu16_epi64`
-* [ ] `_mm256_cvtepu32_epi64`
-* [ ] `_mm256_cvtepu8_epi16`
-* [ ] `_mm256_cvtepu8_epi32`
-* [ ] `_mm256_cvtepu8_epi64`
-* [ ] `_mm256_extracti128_si256`
-* [x] `_mm256_hadd_epi16`
-* [x] `_mm256_hadd_epi32`
-* [x] `_mm256_hadds_epi16`
-* [x] `_mm256_hsub_epi16`
-* [x] `_mm256_hsub_epi32`
-* [x] `_mm256_hsubs_epi16`
-* [ ] `_mm_i32gather_pd`
-* [ ] `_mm256_i32gather_pd`
-* [ ] `_mm_i32gather_ps`
-* [ ] `_mm256_i32gather_ps`
-* [ ] `_mm_i32gather_epi32`
-* [ ] `_mm256_i32gather_epi32`
-* [ ] `_mm_i32gather_epi64`
-* [ ] `_mm256_i32gather_epi64`
-* [ ] `_mm_i64gather_pd`
-* [ ] `_mm256_i64gather_pd`
-* [ ] `_mm_i64gather_ps`
-* [ ] `_mm256_i64gather_ps`
-* [ ] `_mm_i64gather_epi32`
-* [ ] `_mm256_i64gather_epi32`
-* [ ] `_mm_i64gather_epi64`
-* [ ] `_mm256_i64gather_epi64`
-* [ ] `_mm256_inserti128_si256`
-* [x] `_mm256_madd_epi16`
-* [x] `_mm256_maddubs_epi16`
-* [ ] `_mm_mask_i32gather_pd`
-* [ ] `_mm256_mask_i32gather_pd`
-* [ ] `_mm_mask_i32gather_ps`
-* [ ] `_mm256_mask_i32gather_ps`
-* [ ] `_mm_mask_i32gather_epi32`
-* [ ] `_mm256_mask_i32gather_epi32`
-* [ ] `_mm_mask_i32gather_epi64`
-* [ ] `_mm256_mask_i32gather_epi64`
-* [ ] `_mm_mask_i64gather_pd`
-* [ ] `_mm256_mask_i64gather_pd`
-* [ ] `_mm_mask_i64gather_ps`
-* [ ] `_mm256_mask_i64gather_ps`
-* [ ] `_mm_mask_i64gather_epi32`
-* [ ] `_mm256_mask_i64gather_epi32`
-* [ ] `_mm_mask_i64gather_epi64`
-* [ ] `_mm256_mask_i64gather_epi64`
-* [ ] `_mm_maskload_epi32`
-* [ ] `_mm256_maskload_epi32`
-* [ ] `_mm_maskload_epi64`
-* [ ] `_mm256_maskload_epi64`
-* [ ] `_mm_maskstore_epi32`
-* [ ] `_mm256_maskstore_epi32`
-* [ ] `_mm_maskstore_epi64`
-* [ ] `_mm256_maskstore_epi64`
-* [x] `_mm256_max_epi8`
-* [x] `_mm256_max_epi16`
-* [x] `_mm256_max_epi32`
-* [x] `_mm256_max_epu8`
-* [x] `_mm256_max_epu16`
-* [x] `_mm256_max_epu32`
-* [x] `_mm256_min_epi8`
-* [x] `_mm256_min_epi16`
-* [x] `_mm256_min_epi32`
-* [x] `_mm256_min_epu8`
-* [x] `_mm256_min_epu16`
-* [x] `_mm256_min_epu32`
-* [ ] `_mm256_movemask_epi8`
-* [ ] `_mm256_mpsadbw_epu8`
-* [x] `_mm256_mul_epi32`
-* [x] `_mm256_mul_epu32`
-* [x] `_mm256_mulhi_epi16`
-* [x] `_mm256_mulhi_epu16`
-* [x] `_mm256_mulhrs_epi16`
-* [x] `_mm256_mullo_epi16`
-* [x] `_mm256_mullo_epi32`
-* [x] `_mm256_or_si256`
-* [x] `_mm256_packs_epi16`
-* [x] `_mm256_packs_epi32`
-* [x] `_mm256_packus_epi16`
-* [x] `_mm256_packus_epi32`
-* [ ] `_mm256_permute2x128_si256`
-* [ ] `_mm256_permute4x64_epi64`
-* [ ] `_mm256_permute4x64_pd`
-* [ ] `_mm256_permutevar8x32_epi32`
-* [ ] `_mm256_permutevar8x32_ps`
-* [x] `_mm256_sad_epu8`
-* [ ] `_mm256_shuffle_epi32`
-* [ ] `_mm256_shuffle_epi8`
-* [ ] `_mm256_shufflehi_epi16`
-* [ ] `_mm256_shufflelo_epi16`
-* [x] `_mm256_sign_epi8`
-* [x] `_mm256_sign_epi16`
-* [x] `_mm256_sign_epi32`
-* [ ] `_mm256_slli_si256`
-* [ ] `_mm256_bslli_epi128`
-* [x] `_mm256_sll_epi16`
-* [x] `_mm256_slli_epi16`
-* [x] `_mm256_sll_epi32`
-* [x] `_mm256_slli_epi32`
-* [x] `_mm256_sll_epi64`
-* [x] `_mm256_slli_epi64`
-* [x] `_mm_sllv_epi32`
-* [x] `_mm256_sllv_epi32`
-* [x] `_mm_sllv_epi64`
-* [x] `_mm256_sllv_epi64`
-* [x] `_mm256_sra_epi16`
-* [x] `_mm256_srai_epi16`
-* [x] `_mm256_sra_epi32`
-* [x] `_mm256_srai_epi32`
-* [x] `_mm_srav_epi32`
-* [x] `_mm256_srav_epi32`
-* [x] `_mm256_srli_si256`
-* [ ] `_mm256_bsrli_epi128`
-* [x] `_mm256_srl_epi16`
-* [x] `_mm256_srli_epi16`
-* [x] `_mm256_srl_epi32`
-* [x] `_mm256_srli_epi32`
-* [x] `_mm256_srl_epi64`
-* [x] `_mm256_srli_epi64`
-* [x] `_mm_srlv_epi32`
-* [x] `_mm256_srlv_epi32`
-* [x] `_mm_srlv_epi64`
-* [x] `_mm256_srlv_epi64`
-* [ ] `_mm256_stream_load_si256`
-* [x] `_mm256_sub_epi8`
-* [x] `_mm256_sub_epi16`
-* [x] `_mm256_sub_epi32`
-* [x] `_mm256_sub_epi64`
-* [x] `_mm256_subs_epi8`
-* [x] `_mm256_subs_epi16`
-* [x] `_mm256_subs_epu8`
-* [x] `_mm256_subs_epu16`
-* [x] `_mm256_xor_si256`
-* [ ] `_mm256_unpackhi_epi8`
-* [ ] `_mm256_unpackhi_epi16`
-* [ ] `_mm256_unpackhi_epi32`
-* [ ] `_mm256_unpackhi_epi64`
-* [ ] `_mm256_unpacklo_epi8`
-* [ ] `_mm256_unpacklo_epi16`
-* [ ] `_mm256_unpacklo_epi32`
-* [ ] `_mm256_unpacklo_epi64`
@@ -4,7 +4,7 @@
 //! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html).

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Reverse the order of the bytes.
 #[inline(always)]
@@ -6,7 +6,7 @@
 pub use super::v6::*;

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Count Leading Zeros.
 #[inline(always)]
@@ -5,7 +5,7 @@
 pub use super::v7::*;

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Reverse the order of the bytes.
 #[inline(always)]
@@ -89,7 +89,7 @@
 #![cfg_attr(test, feature(proc_macro))]

 #[cfg(test)]
-extern crate assert_instr;
+extern crate stdsimd_test;

 /// Platform independent SIMD vector types and operations.
 pub mod simd {
@@ -23,12 +23,12 @@ macro_rules! define_impl {
        $($elname:ident),+
    ) => {
        impl $name {
-            #[inline]
+            #[inline(always)]
            pub fn new($($elname: $elemty),*) -> $name {
                $name($($elname),*)
            }

-            #[inline]
+            #[inline(always)]
            pub fn splat(value: $elemty) -> $name {
                $name($({
                    #[allow(non_camel_case_types, dead_code)]
@@ -37,25 +37,25 @@ pub fn splat(value: $elemty) -> $name {
                }),*)
            }

-            #[inline]
+            #[inline(always)]
            pub fn extract(self, idx: u32) -> $elemty {
                assert!(idx < $nelems);
                unsafe { simd_extract(self, idx) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn replace(self, idx: u32, val: $elemty) -> $name {
                assert!(idx < $nelems);
                unsafe { simd_insert(self, idx, val) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn store(self, slice: &mut [$elemty], offset: usize) {
                assert!(slice[offset..].len() >= $nelems);
                unsafe { self.store_unchecked(slice, offset) }
            }

-            #[inline]
+            #[inline(always)]
            pub unsafe fn store_unchecked(
                self,
                slice: &mut [$elemty],
@@ -70,13 +70,13 @@ pub unsafe fn store_unchecked(
                    size_of::<$name>());
            }

-            #[inline]
+            #[inline(always)]
            pub fn load(slice: &[$elemty], offset: usize) -> $name {
                assert!(slice[offset..].len() >= $nelems);
                unsafe { $name::load_unchecked(slice, offset) }
            }

-            #[inline]
+            #[inline(always)]
            pub unsafe fn load_unchecked(
                slice: &[$elemty],
                offset: usize,
@@ -92,32 +92,32 @@ pub unsafe fn load_unchecked(
                x
            }

-            #[inline]
+            #[inline(always)]
            pub fn eq(self, other: $name) -> $boolname {
                unsafe { simd_eq(self, other) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn ne(self, other: $name) -> $boolname {
                unsafe { simd_ne(self, other) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn lt(self, other: $name) -> $boolname {
                unsafe { simd_lt(self, other) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn le(self, other: $name) -> $boolname {
                unsafe { simd_le(self, other) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn gt(self, other: $name) -> $boolname {
                unsafe { simd_gt(self, other) }
            }

-            #[inline]
+            #[inline(always)]
            pub fn ge(self, other: $name) -> $boolname {
                unsafe { simd_ge(self, other) }
            }
@@ -129,6 +129,7 @@ macro_rules! define_from {
    ($to:ident, $($from:ident),+) => {
        $(
            impl From<$from> for $to {
+                #[inline(always)]
                fn from(f: $from) -> $to {
                    unsafe { ::std::mem::transmute(f) }
                }
@@ -259,7 +260,7 @@ macro_rules! define_casts {
    ($(($fromty:ident, $toty:ident, $cast:ident)),+) => {
        $(
            impl $fromty {
-                #[inline]
+                #[inline(always)]
                pub fn $cast(self) -> ::simd::$toty {
                    unsafe { simd_cast(self) }
                }
@@ -11,7 +11,7 @@
 //! provides a quick overview of the instructions available.

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Counts the leading most significant zero bits.
 ///
@@ -41,30 +41,28 @@ pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
 #[cfg_attr(test, assert_instr(popcnt))]
 pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }

-#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use x86::abm;

-    #[test]
-    #[target_feature = "+lzcnt"]
+    #[simd_test = "lzcnt"]
    fn _lzcnt_u32() {
        assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
    }

-    #[test]
-    #[target_feature = "+lzcnt"]
+    #[simd_test = "lzcnt"]
    fn _lzcnt_u64() {
        assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
    }

-    #[test]
-    #[target_feature = "+popcnt"]
+    #[simd_test = "popcnt"]
    fn _popcnt32() {
        assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
    }

-    #[test]
-    #[target_feature = "+popcnt"]
+    #[simd_test = "popcnt"]
    fn _popcnt64() {
        assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
    }
@@ -94,38 +94,14 @@ pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
    fn roundpd256(a: f64x4, b: i32) -> f64x4;
 }

-// Function stubs: work around assert_instr issues in expanded forms
-// ref: https://github.com/rust-lang-nursery/stdsimd/issues/49
-// ref: https://github.com/rust-lang-nursery/stdsimd/issues/47
-
-// #[cfg(test)]
-// #[target_feature = "+avx"]
-// #[cfg_attr(test, assert_instr(vroundpd))]
-// pub fn _mm256_round_pd_auto(a: f64x4, b: i32) -> f64x4 {
-//     return _mm256_round_pd(a, b);
-// }
-
-// #[cfg(test)]
-// #[target_feature = "+avx"]
-// #[cfg_attr(test, assert_instr(vroundpd))]
-// pub fn _mm256_ceil_pd_auto(a: f64x4) -> f64x4 {
-//     return _mm256_ceil_pd(a);
-// }
-
-// #[cfg(test)]
-// #[target_feature = "+avx"]
-// #[cfg_attr(test, assert_instr(vroundpd))]
-// pub fn _mm256_floor_pd_auto(a: f64x4) -> f64x4 {
-//     return _mm256_floor_pd(a);
-// }
-
 #[cfg(all(test, target_feature = "avx", any(target_arch = "x86", target_arch = "x86_64")))]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use v256::*;
    use x86::avx;

-    #[test]
-    #[target_feature = "+avx"]
+    #[simd_test = "avx"]
    fn _mm256_add_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
@@ -134,8 +110,7 @@ fn _mm256_add_pd() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx"]
+    #[simd_test = "avx"]
    fn _mm256_add_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
@@ -144,8 +119,7 @@ fn _mm256_add_ps() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx"]
+    #[simd_test = "avx"]
    fn _mm256_addsub_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
@@ -2,9 +2,13 @@
 use v128::*;
 use x86::__m256i;

+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 /// Computes the absolute values of packed 32-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpabsd))]
 pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
    unsafe { pabsd(a) }
 }
@@ -12,6 +16,7 @@ pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 {
 /// Computes the absolute values of packed 16-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpabsw))]
 pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
    unsafe { pabsw(a) }
 }
@@ -19,6 +24,7 @@ pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 {
 /// Computes the absolute values of packed 8-bit integers in `a`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpabsb))]
 pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
    unsafe { pabsb(a) }
 }
@@ -26,6 +32,7 @@ pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 {
 /// Add packed 64-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddq))]
 pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a + b
 }
@@ -33,6 +40,7 @@ pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 {
 /// Add packed 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddd))]
 pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a + b
 }
@@ -40,6 +48,7 @@ pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// Add packed 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddw))]
 pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a + b
 }
@@ -47,6 +56,7 @@ pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Add packed 8-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddb))]
 pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a + b
 }
@@ -54,6 +64,7 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// Add packed 8-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddsb))]
 pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
    unsafe { paddsb(a, b) }
 }
@@ -61,6 +72,7 @@ pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// Add packed 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddsw))]
 pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { paddsw(a, b) }
 }
@@ -68,6 +80,7 @@ pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddusb))]
 pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
    unsafe { paddusb(a, b) }
 }
@@ -75,6 +88,7 @@ pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 {
 /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpaddusw))]
 pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
    unsafe { paddusw(a, b) }
 }
@@ -85,6 +99,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
 /// in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vandps))]
 pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    a & b
 }
@@ -93,6 +108,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
 /// in `a` and then AND with `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vandnps))]
 pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    (!a) & b
 }
@@ -100,6 +116,7 @@ pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
 /// Average packed unsigned 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpavgw))]
 pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
    unsafe { pavgw(a, b) }
 }
@@ -107,6 +124,7 @@ pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
 /// Average packed unsigned 8-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpavgb))]
 pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
    unsafe { pavgb(a, b) }
 }
@@ -118,6 +136,7 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpblendvb))]
 pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
    unsafe { pblendvb(a,b,mask) }
 }
@@ -143,6 +162,7 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
 /// Compare packed 64-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpeqq))]
 pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a.eq(b)
 }
@@ -150,6 +170,7 @@ pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
 /// Compare packed 32-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpeqd))]
 pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a.eq(b)
 }
@@ -157,6 +178,7 @@ pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// Compare packed 16-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpeqw))]
 pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a.eq(b)
 }
@@ -164,6 +186,7 @@ pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Compare packed 8-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpeqb))]
 pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a.eq(b)
 }
@@ -171,6 +194,7 @@ pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// Compare packed 64-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpgtq))]
 pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a.gt(b)
 }
@@ -178,6 +202,7 @@ pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpgtd))]
 pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a.gt(b)
 }
@@ -185,6 +210,7 @@ pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpgtw))]
 pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a.gt(b)
 }
@@ -192,6 +218,7 @@ pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpcmpgtb))]
 pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a.gt(b)
 }
@@ -213,6 +240,7 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphaddw))]
 pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { phaddw(a, b) }
 }
@@ -220,6 +248,7 @@ pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphaddd))]
 pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
    unsafe { phaddd(a, b) }
 }
@@ -228,6 +257,7 @@ pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphaddsw))]
 pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { phaddsw(a, b) }
 }
@@ -235,6 +265,7 @@ pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphsubw))]
 pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { phsubw(a, b) }
 }
@@ -242,6 +273,7 @@ pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphsubd))]
 pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
    unsafe { phsubd(a, b) }
 }
@@ -250,6 +282,7 @@ pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vphsubsw))]
 pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { phsubsw(a, b) }
 }
@@ -294,6 +327,7 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// of intermediate 32-bit integers.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
 pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
    unsafe { pmaddwd(a, b) }
 }
@@ -304,6 +338,7 @@ pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
 /// signed 16-bit integers
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
 pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
    unsafe { pmaddubsw(a, b) }
 }
@@ -321,6 +356,7 @@ pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
 /// maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
 pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { pmaxsw(a, b) }
 }
@@ -329,6 +365,7 @@ pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
 pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
    unsafe { pmaxsd(a, b) }
 }
@@ -337,6 +374,7 @@ pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
 pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
    unsafe { pmaxsb(a, b) }
 }
@@ -345,6 +383,7 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// the packed maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
 pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
    unsafe { pmaxuw(a, b) }
 }
@@ -353,6 +392,7 @@ pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
 /// the packed maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxud))]
 pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
    unsafe { pmaxud(a, b) }
 }
@@ -361,6 +401,7 @@ pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
 /// the packed maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmaxub))]
 pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
    unsafe { pmaxub(a, b) }
 }
@@ -369,6 +410,7 @@ pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
 /// minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminsw))]
 pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { pminsw(a, b) }
 }
@@ -377,6 +419,7 @@ pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminsd))]
 pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
    unsafe { pminsd(a, b) }
 }
@@ -385,6 +428,7 @@ pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminsb))]
 pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
    unsafe { pminsb(a, b) }
 }
@@ -393,6 +437,7 @@ pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminuw))]
 pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
    unsafe { pminuw(a, b) }
 }
@@ -401,6 +446,7 @@ pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminud))]
 pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
    unsafe { pminud(a, b) }
 }
@@ -409,6 +455,7 @@ pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpminub))]
 pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
    unsafe { pminub(a, b) }
 }
@@ -444,6 +491,7 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
 /// Return the 64-bit results.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmuldq))]
 pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
    unsafe { pmuldq(a, b) }
 }
@@ -454,6 +502,7 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
 /// Return the unsigned 64-bit results.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmuludq))]
 pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
    unsafe { pmuludq(a, b) }
 }
@@ -463,6 +512,7 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
 /// intermediate integers.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmulhw))]
 pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { pmulhw(a, b) }
 }
@@ -472,6 +522,7 @@ pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// intermediate integers.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
 pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
    unsafe { pmulhuw(a, b) }
 }
@@ -481,6 +532,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
 /// intermediate integers
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmullw))]
 pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
    a * b
 }
@@ -491,6 +543,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
 /// intermediate integers
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmulld))]
 pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
    a * b
 }
@@ -501,6 +554,7 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
 /// return bits [16:1]
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
 pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
    unsafe { pmulhrsw(a, b) }
 }
@@ -509,6 +563,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
 /// and `b`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vorps))]
 pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
    a | b
 }
@@ -517,6 +572,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpacksswb))]
 pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
    unsafe { packsswb(a, b) }
 }
@@ -525,6 +581,7 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpackssdw))]
 pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
    unsafe { packssdw(a, b) }
 }
@@ -533,6 +590,7 @@ pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
 /// using unsigned saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpackuswb))]
 pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
    unsafe { packuswb(a, b) }
 }
@@ -541,6 +599,7 @@ pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
 /// using unsigned saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpackusdw))]
 pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
    unsafe { packusdw(a, b) }
 }
@@ -557,6 +616,7 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
 /// integers in the low 16 bits of the 64-bit return value
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsadbw))]
 pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
    unsafe { psadbw(a, b) }
 }
@@ -571,6 +631,7 @@ pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
 /// Results are zeroed out when the corresponding element in `b` is zero.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsignw))]
 pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { psignw(a, b) }
 }
@@ -580,6 +641,7 @@ pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Results are zeroed out when the corresponding element in `b` is zero.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsignd))]
 pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
    unsafe { psignd(a, b) }
 }
@@ -589,6 +651,7 @@ pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// Results are zeroed out when the corresponding element in `b` is zero.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsignb))]
 pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
    unsafe { psignb(a, b) }
 }
@@ -597,6 +660,7 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllw))]
 pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
    unsafe { psllw(a, count) }
 }
@@ -605,6 +669,7 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpslld))]
 pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
    unsafe { pslld(a, count) }
 }
@@ -613,6 +678,7 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllq))]
 pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
    unsafe { psllq(a, count) }
 }
@@ -621,6 +687,7 @@ pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllw))] // TODO: should this be pslli
 pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
    unsafe { pslliw(a, imm8) }
 }
@@ -629,6 +696,7 @@ pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpslld))] // TODO: should this be pslli
 pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
    unsafe { psllid(a, imm8) }
 }
@@ -637,6 +705,7 @@ pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
 /// shifting in zeros, return the results;
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllq))] // TODO: should this be pslli
 pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
    unsafe { pslliq(a, imm8) }
 }
@@ -648,6 +717,7 @@ pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
 /// shifting in zeros, and return the result.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllvd))]
 pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
    unsafe { psllvd(a, count) }
 }
@@ -657,6 +727,7 @@ pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 {
 /// shifting in zeros, and return the result.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllvd))]
 pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
    unsafe { psllvd256(a, count) }
 }
@@ -666,6 +737,7 @@ pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 {
 /// shifting in zeros, and return the result.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllvq))]
 pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
    unsafe { psllvq(a, count) }
 }
@@ -675,6 +747,7 @@ pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 {
 /// shifting in zeros, and return the result.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsllvq))]
 pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
    unsafe { psllvq256(a, count) }
 }
@@ -683,6 +756,7 @@ pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 {
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsraw))]
 pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
    unsafe { psraw(a, count) }
 }
@@ -691,6 +765,7 @@ pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 {
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrad))]
 pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
    unsafe { psrad(a, count) }
 }
@@ -699,6 +774,7 @@ pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsraw))] // TODO: notvpsraiw?
 pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
    unsafe { psraiw(a, imm8) }
 }
@@ -707,6 +783,7 @@ pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrad))] // TODO: not vpsraid?
 pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
    unsafe { psraid(a, imm8) }
 }
@@ -715,6 +792,7 @@ pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 {
 /// corresponding element in `count` while shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsravd))]
 pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
    unsafe { psravd(a, count) }
 }
@@ -723,6 +801,7 @@ pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 {
 /// corresponding element in `count` while shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsravd))]
 pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
    unsafe { psravd256(a, count) }
 }
@@ -732,6 +811,7 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
 /// zeros.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlw))]
 pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
    unsafe { psrlw(a, count) }
 }
@@ -740,6 +820,7 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
 /// zeros.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrld))]
 pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
    unsafe { psrld(a, count) }
 }
@@ -748,6 +829,7 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
 /// zeros.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlq))]
 pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
    unsafe { psrlq(a, count) }
 }
@@ -756,6 +838,7 @@ pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlw))] // TODO not vpsrliw?
 pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
    unsafe { psrliw(a, imm8) }
 }
@@ -764,6 +847,7 @@ pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrld))] // TODO: not vpsrlid?
 pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
    unsafe { psrlid(a, imm8) }
 }
@@ -772,6 +856,7 @@ pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlq))] // TODO: not vpsrliq?
 pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
    unsafe { psrliq(a, imm8) }
 }
@@ -780,6 +865,7 @@ pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 {
 /// the corresponding element in `count` while shifting in zeros,
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
 pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
    unsafe { psrlvd(a, count) }
 }
@@ -788,6 +874,7 @@ pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 {
 /// the corresponding element in `count` while shifting in zeros,
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
 pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
    unsafe { psrlvd256(a, count) }
 }
@@ -796,6 +883,7 @@ pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 {
 /// the corresponding element in `count` while shifting in zeros,
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
 pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
    unsafe { psrlvq(a, count) }
 }
@@ -804,6 +892,7 @@ pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 {
 /// the corresponding element in `count` while shifting in zeros,
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
 pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
    unsafe { psrlvq256(a, count) }
 }
@@ -813,6 +902,7 @@ pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 {
 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubw))]
 pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
    a - b
 }
@@ -820,6 +910,7 @@ pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubd))]
 pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
    a - b
 }
@@ -827,6 +918,7 @@ pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 {
 /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubq))]
 pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
    a - b
 }
@@ -834,6 +926,7 @@ pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 {
 /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubb))]
 pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
    a - b
 }
@@ -842,6 +935,7 @@ pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// `a` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubsw))]
 pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
    unsafe { psubsw(a, b) }
 }
@@ -850,6 +944,7 @@ pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 {
 /// `a` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubsb))]
 pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
    unsafe { psubsb(a, b) }
 }
@@ -858,6 +953,7 @@ pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 {
 /// integers in `a` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubusw))]
 pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
    unsafe { psubusw(a, b) }
 }
@@ -866,6 +962,7 @@ pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 {
 /// integers in `a` using saturation.
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpsubusb))]
 pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
    unsafe { psubusb(a, b) }
 }
@@ -883,6 +980,7 @@ pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
 /// in `a` and `b`
 #[inline(always)]
 #[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vxorps))]
 pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
    a ^ b
 }
@@ -1044,16 +1142,17 @@ pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
 }


-#[cfg(all(test, target_feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use v256::*;
    use v128::*;
    use x86::avx2;
    use x86::__m256i;
    use std;

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_abs_epi32() {
        let a = i32x8::new(
            0, 1, -1, std::i32::MAX,
@@ -1065,8 +1164,7 @@ fn _mm256_abs_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_abs_epi16() {
        let a = i16x16::new(
            0, 1, -1, 2,
@@ -1082,8 +1180,7 @@ fn _mm256_abs_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_abs_epi8() {
        let a = i8x32::new(
            0, 1, -1, 2,
@@ -1103,8 +1200,7 @@ fn _mm256_abs_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_add_epi64() {
        let a = i64x4::new(-10, 0, 100, 1_000_000_000);
        let b = i64x4::new(-1, 0, 1, 2);
@@ -1113,8 +1209,7 @@ fn _mm256_add_epi64() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_add_epi32() {
        let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6);
        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
@@ -1123,8 +1218,7 @@ fn _mm256_add_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_add_epi16() {
        let a = i16x16::new(
            0, 1, 2, 3, 4, 5, 6, 7,
@@ -1139,8 +1233,7 @@ fn _mm256_add_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_add_epi8() {
        let a = i8x32::new(
            0, 1, 2, 3, 4, 5, 6, 7,
@@ -1161,8 +1254,7 @@ fn _mm256_add_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi8() {
        let a = i8x32::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -1177,8 +1269,7 @@ fn _mm256_adds_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi8_saturate_positive() {
        let a = i8x32::splat(0x7F);
        let b = i8x32::splat(1);
@@ -1186,8 +1277,7 @@ fn _mm256_adds_epi8_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi8_saturate_negative() {
        let a = i8x32::splat(-0x80);
        let b = i8x32::splat(-1);
@@ -1195,8 +1285,7 @@ fn _mm256_adds_epi8_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi16() {
        let a = i16x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1209,8 +1298,7 @@ fn _mm256_adds_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi16_saturate_positive() {
        let a = i16x16::splat(0x7FFF);
        let b = i16x16::splat(1);
@@ -1218,8 +1306,7 @@ fn _mm256_adds_epi16_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epi16_saturate_negative() {
        let a = i16x16::splat(-0x8000);
        let b = i16x16::splat(-1);
@@ -1227,8 +1314,7 @@ fn _mm256_adds_epi16_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epu8() {
        let a = u8x32::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -1243,8 +1329,7 @@ fn _mm256_adds_epu8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epu8_saturate() {
        let a = u8x32::splat(0xFF);
        let b = u8x32::splat(1);
@@ -1253,8 +1338,7 @@ fn _mm256_adds_epu8_saturate() {
    }


-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epu16() {
        let a = u16x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1267,8 +1351,7 @@ fn _mm256_adds_epu16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_adds_epu16_saturate() {
        let a = u16x16::splat(0xFFFF);
        let b = u16x16::splat(1);
@@ -1276,40 +1359,35 @@ fn _mm256_adds_epu16_saturate() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_and_si256() {
        assert_eq!(
            avx2::_mm256_and_si256(
                __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_andnot_si256() {
        assert_eq!(
            avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)),
            __m256i::splat(2));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_avg_epu8() {
        let (a, b) = (u8x32::splat(3), u8x32::splat(9));
        let r = avx2::_mm256_avg_epu8(a, b);
        assert_eq!(r, u8x32::splat(6));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_avg_epu16() {
        let (a, b) = (u16x16::splat(3), u16x16::splat(9));
        let r = avx2::_mm256_avg_epu16(a, b);
        assert_eq!(r, u16x16::splat(6));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_blendv_epi8() {
        let (a,b) = (i8x32::splat(4),i8x32::splat(2));
        let mask = i8x32::splat(0).replace(2,-1);
@@ -1318,7 +1396,7 @@ fn _mm256_blendv_epi8() {
        assert_eq!(r,e);
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpeq_epi8() {
        let a = i8x32::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -1330,7 +1408,7 @@ fn _mm256_cmpeq_epi8() {
        assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpeq_epi16() {
        let a = i16x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1340,7 +1418,7 @@ fn _mm256_cmpeq_epi16() {
        assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpeq_epi32() {
        let a = i32x8::new(0, 1, 2, 3,4,5,6,7);
        let b = i32x8::new(7,6,2,4,3, 2, 1, 0);
@@ -1348,7 +1426,7 @@ fn _mm256_cmpeq_epi32() {
        assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpeq_epi64() {
        let a = i64x4::new(0, 1, 2, 3);
        let b = i64x4::new(3, 2, 2, 0);
@@ -1357,7 +1435,7 @@ fn _mm256_cmpeq_epi64() {
            2, 0xFFFFFFFFFFFFFFFFu64 as i64));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpgt_epi8() {
        let a = i8x32::splat(0).replace(0, 5);
        let b = i8x32::splat(0);
@@ -1365,7 +1443,7 @@ fn _mm256_cmpgt_epi8() {
        assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpgt_epi16() {
        let a = i16x16::splat(0).replace(0, 5);
        let b = i16x16::splat(0);
@@ -1373,7 +1451,7 @@ fn _mm256_cmpgt_epi16() {
        assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpgt_epi32() {
        let a = i32x8::splat(0).replace(0, 5);
        let b = i32x8::splat(0);
@@ -1381,7 +1459,7 @@ fn _mm256_cmpgt_epi32() {
        assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

-    #[test]
+    #[simd_test = "avx2"]
    fn _mm256_cmpgt_epi64() {
        let a = i64x4::splat(0).replace(0, 5);
        let b = i64x4::splat(0);
@@ -1390,8 +1468,7 @@ fn _mm256_cmpgt_epi64() {
            0, 0xFFFFFFFFFFFFFFFFu64 as i64));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hadd_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1400,8 +1477,7 @@ fn _mm256_hadd_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hadd_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1410,8 +1486,7 @@ fn _mm256_hadd_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hadds_epi16() {
        let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1);
        let b = i16x16::splat(4);
@@ -1421,8 +1496,7 @@ fn _mm256_hadds_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hsub_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1431,8 +1505,7 @@ fn _mm256_hsub_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hsub_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1441,8 +1514,7 @@ fn _mm256_hsub_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_hsubs_epi16() {
        let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);
        let b = i16x16::splat(4);
@@ -1451,8 +1523,7 @@ fn _mm256_hsubs_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_madd_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1461,8 +1532,7 @@ fn _mm256_madd_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_maddubs_epi16() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
@@ -1471,8 +1541,7 @@ fn _mm256_maddubs_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1480,8 +1549,7 @@ fn _mm256_max_epi16() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1489,8 +1557,7 @@ fn _mm256_max_epi32() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(4);
@@ -1498,8 +1565,7 @@ fn _mm256_max_epi8() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epu16() {
        let a = u16x16::splat(2);
        let b = u16x16::splat(4);
@@ -1507,8 +1573,7 @@ fn _mm256_max_epu16() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epu32() {
        let a = u32x8::splat(2);
        let b = u32x8::splat(4);
@@ -1516,8 +1581,7 @@ fn _mm256_max_epu32() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_max_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
@@ -1525,8 +1589,7 @@ fn _mm256_max_epu8() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1534,8 +1597,7 @@ fn _mm256_min_epi16() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1543,8 +1605,7 @@ fn _mm256_min_epi32() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(4);
@@ -1552,8 +1613,7 @@ fn _mm256_min_epi8() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epu16() {
        let a = u16x16::splat(2);
        let b = u16x16::splat(4);
@@ -1561,8 +1621,7 @@ fn _mm256_min_epu16() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epu32() {
        let a = u32x8::splat(2);
        let b = u32x8::splat(4);
@@ -1570,8 +1629,7 @@ fn _mm256_min_epu32() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_min_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
@@ -1603,8 +1661,7 @@ fn _mm256_mpsadbw_epu8() {
    }
 **/

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mul_epi32() {
        let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
@@ -1613,8 +1670,7 @@ fn _mm256_mul_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mul_epu32() {
        let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
        let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
@@ -1623,8 +1679,7 @@ fn _mm256_mul_epu32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mulhi_epi16() {
        let a = i16x16::splat(6535);
        let b = i16x16::splat(6535);
@@ -1633,8 +1688,7 @@ fn _mm256_mulhi_epi16() {
        assert_eq!(r, e);
    }

-      #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mulhi_epu16() {
        let a = u16x16::splat(6535);
        let b = u16x16::splat(6535);
@@ -1643,8 +1697,7 @@ fn _mm256_mulhi_epu16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mullo_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1653,8 +1706,7 @@ fn _mm256_mullo_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mullo_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1663,8 +1715,7 @@ fn _mm256_mullo_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_mulhrs_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1673,8 +1724,7 @@ fn _mm256_mulhrs_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_or_si256() {
        let a = __m256i::splat(-1);
        let b = __m256i::splat(0);
@@ -1682,8 +1732,7 @@ fn _mm256_or_si256() {
        assert_eq!(r, a);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_packs_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1697,8 +1746,7 @@ fn _mm256_packs_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_packs_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1712,8 +1760,7 @@ fn _mm256_packs_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_packus_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(4);
@@ -1727,8 +1774,7 @@ fn _mm256_packus_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_packus_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(4);
@@ -1742,8 +1788,7 @@ fn _mm256_packus_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sad_epu8() {
        let a = u8x32::splat(2);
        let b = u8x32::splat(4);
@@ -1752,8 +1797,7 @@ fn _mm256_sad_epu8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sign_epi16() {
        let a = i16x16::splat(2);
        let b = i16x16::splat(-1);
@@ -1762,8 +1806,7 @@ fn _mm256_sign_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sign_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(-1);
@@ -1772,8 +1815,7 @@ fn _mm256_sign_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sign_epi8() {
        let a = i8x32::splat(2);
        let b = i8x32::splat(-1);
@@ -1782,8 +1824,7 @@ fn _mm256_sign_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sll_epi16() {
         assert_eq!(
            avx2::_mm256_sll_epi16(i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)),
@@ -1791,8 +1832,7 @@ fn _mm256_sll_epi16() {

    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sll_epi32() {
         assert_eq!(
            avx2::_mm256_sll_epi32(i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)),
@@ -1800,8 +1840,7 @@ fn _mm256_sll_epi32() {

    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sll_epi64() {
         assert_eq!(
            avx2::_mm256_sll_epi64(i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)),
@@ -1809,32 +1848,28 @@ fn _mm256_sll_epi64() {

    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_slli_epi16() {
        assert_eq!(
            avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
            i16x16::splat(0xFF0));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_slli_epi32() {
        assert_eq!(
            avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4),
            i32x8::splat(0xFFFF0));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_slli_epi64() {
        assert_eq!(
            avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4),
            i64x4::splat(0xFFFFFFFF0));
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm_sllv_epi32() {
        let a = i32x4::splat(2);
        let b = i32x4::splat(1);
@@ -1843,8 +1878,7 @@ fn _mm_sllv_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sllv_epi32() {
        let a = i32x8::splat(2);
        let b = i32x8::splat(1);
@@ -1852,8 +1886,8 @@ fn _mm256_sllv_epi32() {
        let e = i32x8::splat(4);
        assert_eq!(r, e);
    }
-    #[test]
-    #[target_feature = "+avx2"]
+
+    #[simd_test = "avx2"]
    fn _mm_sllv_epi64() {
        let a = i64x2::splat(2);
        let b = i64x2::splat(1);
@@ -1861,8 +1895,8 @@ fn _mm_sllv_epi64() {
        let e = i64x2::splat(4);
        assert_eq!(r, e);
    }
-    #[test]
-    #[target_feature = "+avx2"]
+
+    #[simd_test = "avx2"]
    fn _mm256_sllv_epi64() {
        let a = i64x4::splat(2);
        let b = i64x4::splat(1);
@@ -1871,8 +1905,7 @@ fn _mm256_sllv_epi64() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sra_epi16() {
         assert_eq!(
            avx2::_mm256_sra_epi16(
@@ -1880,8 +1913,7 @@ fn _mm256_sra_epi16() {
            i16x16::splat(-1));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sra_epi32() {
         assert_eq!(
            avx2::_mm256_sra_epi32(
@@ -1889,24 +1921,21 @@ fn _mm256_sra_epi32() {
            i32x8::splat(-1));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srai_epi16() {
           assert_eq!(
            avx2::_mm256_srai_epi16(
                i16x16::splat(-1), 1), i16x16::splat(-1));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srai_epi32() {
           assert_eq!(
            avx2::_mm256_srai_epi32(
                i32x8::splat(-1), 1), i32x8::splat(-1));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm_srav_epi32() {
        let a = i32x4::splat(4);
        let count = i32x4::splat(1);
@@ -1915,8 +1944,7 @@ fn _mm_srav_epi32() {
        assert_eq!(r, e );
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srav_epi32() {
        let a = i32x8::splat(4);
        let count = i32x8::splat(1);
@@ -1925,8 +1953,7 @@ fn _mm256_srav_epi32() {
        assert_eq!(r, e );
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srl_epi16() {
        assert_eq!(
            avx2::_mm256_srl_epi16(
@@ -1934,8 +1961,7 @@ fn _mm256_srl_epi16() {
            i16x16::splat(0xF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srl_epi32() {
        assert_eq!(
            avx2::_mm256_srl_epi32(
@@ -1943,8 +1969,7 @@ fn _mm256_srl_epi32() {
            i32x8::splat(0xFFF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srl_epi64() {
        assert_eq!(
            avx2::_mm256_srl_epi64(
@@ -1952,32 +1977,28 @@ fn _mm256_srl_epi64() {
            i64x4::splat(0xFFFFFFF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srli_epi16() {
        assert_eq!(
            avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4),
            i16x16::splat(0xF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srli_epi32() {
        assert_eq!(
            avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4),
            i32x8::splat(0xFFF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srli_epi64() {
        assert_eq!(
            avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4),
            i64x4::splat(0xFFFFFFF));
    }

-    #[test]
-    #[target_feature ="+avx2"]
+    #[simd_test = "avx2"]
    fn _mm_srlv_epi32() {
        let a = i32x4::splat(2);
        let count = i32x4::splat(1);
@@ -1986,8 +2007,7 @@ fn _mm_srlv_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srlv_epi32() {
        let a = i32x8::splat(2);
        let count = i32x8::splat(1);
@@ -1996,8 +2016,7 @@ fn _mm256_srlv_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm_srlv_epi64() {
        let a = i64x2::splat(2);
        let count = i64x2::splat(1);
@@ -2007,8 +2026,7 @@ fn _mm_srlv_epi64() {
    }


-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_srlv_epi64() {
        let a = i64x4::splat(2);
        let count = i64x4::splat(1);
@@ -2017,8 +2035,7 @@ fn _mm256_srlv_epi64() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sub_epi16() {
        let a = i16x16::splat(4);
        let b = i16x16::splat(2);
@@ -2026,8 +2043,7 @@ fn _mm256_sub_epi16() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sub_epi32() {
        let a = i32x8::splat(4);
        let b = i32x8::splat(2);
@@ -2035,8 +2051,7 @@ fn _mm256_sub_epi32() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sub_epi64() {
        let a = i64x4::splat(4);
        let b = i64x4::splat(2);
@@ -2044,8 +2059,7 @@ fn _mm256_sub_epi64() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_sub_epi8() {
        let a = i8x32::splat(4);
        let b = i8x32::splat(2);
@@ -2053,8 +2067,7 @@ fn _mm256_sub_epi8() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_subs_epi16() {
        let a = i16x16::splat(4);
        let b = i16x16::splat(2);
@@ -2062,8 +2075,7 @@ fn _mm256_subs_epi16() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_subs_epi8() {
        let a = i8x32::splat(4);
        let b = i8x32::splat(2);
@@ -2071,8 +2083,7 @@ fn _mm256_subs_epi8() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_subs_epu16() {
        let a = u16x16::splat(4);
        let b = u16x16::splat(2);
@@ -2080,8 +2091,7 @@ fn _mm256_subs_epu16() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_subs_epu8() {
        let a = u8x32::splat(4);
        let b = u8x32::splat(2);
@@ -2089,13 +2099,10 @@ fn _mm256_subs_epu8() {
        assert_eq!(r, b);
    }

-    #[test]
-    #[target_feature = "+avx2"]
+    #[simd_test = "avx2"]
    fn _mm256_xor_si256() {
        assert_eq!(
            avx2::_mm256_xor_si256(__m256i::splat(5), __m256i::splat(3)),
            __m256i::splat(6));
    }
-
-
 }
@@ -8,7 +8,7 @@
 //! provides a quick overview of the available instructions.

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 #[allow(dead_code)]
 extern "C" {
@@ -183,24 +183,24 @@ pub fn _mm_tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }

-#[cfg(all(test, target_feature = "bmi", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use x86::bmi;

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _bextr_u32() {
        assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _bextr_u64() {
        assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _andn_u32() {
        assert_eq!(bmi::_andn_u32(0, 0), 0);
        assert_eq!(bmi::_andn_u32(0, 1), 1);
@@ -214,8 +214,8 @@ fn _andn_u32() {
        assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _andn_u64() {
        assert_eq!(bmi::_andn_u64(0, 0), 0);
        assert_eq!(bmi::_andn_u64(0, 1), 1);
@@ -229,62 +229,57 @@ fn _andn_u64() {
        assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _blsi_u32() {
        assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blsi_u64() {
        assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _blsmsk_u32() {
        assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blsmsk_u64() {
        assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _blsr_u32() {
        /// TODO: test the behavior when the input is 0
        assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blsr_u64() {
        /// TODO: test the behavior when the input is 0
        assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _tzcnt_u16() {
        assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
        assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
        assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
    fn _tzcnt_u32() {
        assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
        assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
        assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
    }

-    #[test]
-    #[target_feature = "+bmi"]
+    #[simd_test = "bmi"]
+    #[cfg(not(target_arch = "x86"))]
    fn _tzcnt_u64() {
        assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
        assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
@@ -8,7 +8,7 @@
 //! provides a quick overview of the available instructions.

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Unsigned multiply without affecting flags.
 ///
@@ -112,12 +112,13 @@ pub fn _pext_u64(a: u64, mask: u64) -> u64 {
    unsafe { x86_bmi2_pext_64(a, mask) }
 }

-#[cfg(all(test, target_feature = "bmi2", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use x86::bmi2;

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
    fn _pext_u32() {
        let n  = 0b1011_1110_1001_0011u32;

@@ -131,8 +132,8 @@ fn _pext_u32() {
        assert_eq!(bmi2::_pext_u32(n, m1), s1);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
+    #[cfg(not(target_arch = "x86"))]
    fn _pext_u64() {
        let n  = 0b1011_1110_1001_0011u64;

@@ -146,8 +147,7 @@ fn _pext_u64() {
        assert_eq!(bmi2::_pext_u64(n, m1), s1);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
    fn _pdep_u32() {
        let n  = 0b1011_1110_1001_0011u32;

@@ -161,8 +161,8 @@ fn _pdep_u32() {
        assert_eq!(bmi2::_pdep_u32(n, m1), s1);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
+    #[cfg(not(target_arch = "x86"))]
    fn _pdep_u64() {
        let n  = 0b1011_1110_1001_0011u64;

@@ -176,24 +176,22 @@ fn _pdep_u64() {
        assert_eq!(bmi2::_pdep_u64(n, m1), s1);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
    fn _bzhi_u32() {
        let n = 0b1111_0010u32;
        let s = 0b0001_0010u32;
        assert_eq!(bmi2::_bzhi_u32(n, 5), s);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
+    #[cfg(not(target_arch = "x86"))]
    fn _bzhi_u64() {
        let n = 0b1111_0010u64;
        let s = 0b0001_0010u64;
        assert_eq!(bmi2::_bzhi_u64(n, 5), s);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
    fn _mulx_u32() {
        let a: u32 = 4_294_967_200;
        let b: u32 = 2;
@@ -205,8 +203,8 @@ fn _mulx_u32() {
        assert_eq!(hi, 0b0001u32);
    }

-    #[test]
-    #[target_feature = "+bmi2"]
+    #[simd_test = "bmi2"]
+    #[cfg(not(target_arch = "x86"))]
    fn _mulx_u64() {
        let a: u64 = 9_223_372_036_854_775_800;
        let b: u64 = 100;
@@ -20,6 +20,9 @@

 #[macro_use]
 mod macros;
+#[macro_use]
+mod runtime;
+
 mod sse;
 mod sse2;
 mod ssse3;
@@ -32,6 +35,3 @@
 mod bmi;
 mod bmi2;
 mod tbm;
-
-#[macro_use]
-mod runtime;
@@ -2,7 +2,7 @@
 use v128::*;

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Adds the first component of `a` and `b`, the other components are copied
 /// from `a`.
@@ -127,7 +127,7 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
-/// and `b`, and return the minimum value in the first element of the return 
+/// and `b`, and return the minimum value in the first element of the return
 /// value, the other elements are copied from `a`.
 #[inline(always)]
 #[target_feature = "+sse"]
@@ -146,7 +146,7 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
-/// and `b`, and return the maximum value in the first element of the return 
+/// and `b`, and return the maximum value in the first element of the return
 /// value, the other elements are copied from `a`.
 #[inline(always)]
 #[target_feature = "+sse"]
@@ -164,14 +164,103 @@ pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
    unsafe { maxps(a, b) }
 }

-/// Unpack and interleave single-precision (32-bit) floating-point elements
-/// from the high half of `a` and `b`;
+// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
+// using `mask`.
+// The lower half of result takes values from `a` and the higher half from `b`.
+// Mask is split to 2 control bits each to index the element from inputs.
 #[inline(always)]
 #[target_feature = "+sse"]
+pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
+    let mask = (mask & 0xFF) as u8;
+
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            unsafe {
+                simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
+            }
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (mask >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 4),
+                0b01 => shuffle_done!($x01, $x23, $x45, 5),
+                0b10 => shuffle_done!($x01, $x23, $x45, 6),
+                _ => shuffle_done!($x01, $x23, $x45, 7),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (mask >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 4),
+                0b01 => shuffle_x67!($x01, $x23, 5),
+                0b10 => shuffle_x67!($x01, $x23, 6),
+                _ => shuffle_x67!($x01, $x23, 7),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (mask >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match mask & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+#[cfg(test)]
+#[cfg_attr(test, assert_instr(shufps))]
+#[target_feature = "+sse"]
+fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
+    _mm_shuffle_ps(a, b, 3)
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the higher half of `a` and `b`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(unpckhps))]
 pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
    unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
 }

+/// Unpack and interleave single-precision (32-bit) floating-point elements
+/// from the lower half of `a` and `b`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(unpcklps))]
+pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
+}
+
+/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the lower
+/// half of result.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movhlps))]
+pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
+}
+
+/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
+/// half of result.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(unpcklpd))]
+pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
+}
+
 /// Return a mask of the most significant bit of each element in `a`.
 ///
 /// The mask is stored in the 4 least significant bits of the return value.
@@ -217,13 +306,13 @@ pub fn _mm_movemask_ps(a: f32x4) -> i32 {
    fn movmskps(a: f32x4) -> i32;
 }

-#[cfg(all(test, target_feature = "sse", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
    use v128::*;
    use x86::sse;
+    use stdsimd_test::simd_test;

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_add_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -231,8 +320,7 @@ fn _mm_add_ps() {
        assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_add_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -240,8 +328,7 @@ fn _mm_add_ss() {
        assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_sub_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -249,8 +336,7 @@ fn _mm_sub_ps() {
        assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_sub_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -258,8 +344,7 @@ fn _mm_sub_ss() {
        assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_mul_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -267,8 +352,7 @@ fn _mm_mul_ps() {
        assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_mul_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -276,8 +360,7 @@ fn _mm_mul_ss() {
        assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_div_ps() {
        let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
@@ -285,8 +368,7 @@ fn _mm_div_ps() {
        assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_div_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -294,8 +376,7 @@ fn _mm_div_ss() {
        assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_sqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_sqrt_ss(a);
@@ -303,8 +384,7 @@ fn _mm_sqrt_ss() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_sqrt_ps(a);
@@ -312,8 +392,7 @@ fn _mm_sqrt_ps() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_rcp_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rcp_ss(a);
@@ -321,8 +400,7 @@ fn _mm_rcp_ss() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rcp_ps(a);
@@ -330,8 +408,7 @@ fn _mm_rcp_ps() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_rsqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rsqrt_ss(a);
@@ -339,8 +416,7 @@ fn _mm_rsqrt_ss() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rsqrt_ps(a);
@@ -348,8 +424,7 @@ fn _mm_rsqrt_ps() {
        assert_eq!(r, e);
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_min_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -357,8 +432,7 @@ fn _mm_min_ss() {
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -366,8 +440,7 @@ fn _mm_min_ps() {
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_max_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -375,8 +448,7 @@ fn _mm_max_ss() {
        assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
@@ -384,8 +456,16 @@ fn _mm_max_ps() {
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
+    fn _mm_shuffle_ps() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+        let mask = 0b00_01_01_11;
+        let r = sse::_mm_shuffle_ps(a, b, mask);
+        assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
+    }
+
+    #[simd_test = "sse"]
    fn _mm_unpackhi_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
@@ -393,8 +473,31 @@ fn _mm_unpackhi_ps() {
        assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
    }

-    #[test]
-    #[target_feature = "+sse"]
+    #[simd_test = "sse"]
+    fn _mm_unpacklo_ps() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = sse::_mm_unpacklo_ps(a, b);
+        assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
+    }
+
+    #[simd_test = "sse"]
+    fn _mm_movehl_ps() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = sse::_mm_movehl_ps(a, b);
+        assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
+    }
+
+    #[simd_test = "sse"]
+    fn _mm_movelh_ps() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = sse::_mm_movelh_ps(a, b);
+        assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
+    }
+
+    #[simd_test = "sse"]
    fn _mm_movemask_ps() {
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
        assert_eq!(r, 0b0101);
@@ -10,7 +10,7 @@
 use v64::*;

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 /// Provide a hint to the processor that the code sequence is a spin-wait loop.
 ///
@@ -1720,35 +1720,36 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) {
    fn movmskpd(a: f64x2) -> i32;
 }

-#[cfg(all(test, target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
    use std::os::raw::c_void;
+    use stdsimd_test::simd_test;

    use v128::*;
    use x86::{__m128i, sse2};

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_pause() {
        sse2::_mm_pause();
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_clflush() {
        let x = 0;
        unsafe { sse2::_mm_clflush(&x as *const _ as *mut c_void); }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_lfence() {
        sse2::_mm_lfence();
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mfence() {
        sse2::_mm_mfence();
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1760,7 +1761,7 @@ fn _mm_add_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_epi8_overflow() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(1);
@@ -1768,7 +1769,7 @@ fn _mm_add_epi8_overflow() {
        assert_eq!(r, i8x16::splat(-128));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
@@ -1777,7 +1778,7 @@ fn _mm_add_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
@@ -1786,7 +1787,7 @@ fn _mm_add_epi32() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
@@ -1795,7 +1796,7 @@ fn _mm_add_epi64() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1807,7 +1808,7 @@ fn _mm_adds_epi8() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi8_saturate_positive() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(1);
@@ -1815,7 +1816,7 @@ fn _mm_adds_epi8_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi8_saturate_negative() {
        let a = i8x16::splat(-0x80);
        let b = i8x16::splat(-1);
@@ -1823,7 +1824,7 @@ fn _mm_adds_epi8_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
@@ -1832,7 +1833,7 @@ fn _mm_adds_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi16_saturate_positive() {
        let a = i16x8::splat(0x7FFF);
        let b = i16x8::splat(1);
@@ -1840,7 +1841,7 @@ fn _mm_adds_epi16_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epi16_saturate_negative() {
        let a = i16x8::splat(-0x8000);
        let b = i16x8::splat(-1);
@@ -1848,7 +1849,7 @@ fn _mm_adds_epi16_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epu8() {
        let a = u8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -1860,7 +1861,7 @@ fn _mm_adds_epu8() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epu8_saturate() {
        let a = u8x16::splat(0xFF);
        let b = u8x16::splat(1);
@@ -1868,7 +1869,7 @@ fn _mm_adds_epu8_saturate() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epu16() {
        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
@@ -1877,7 +1878,7 @@ fn _mm_adds_epu16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_adds_epu16_saturate() {
        let a = u16x8::splat(0xFFFF);
        let b = u16x8::splat(1);
@@ -1885,21 +1886,21 @@ fn _mm_adds_epu16_saturate() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_avg_epu8() {
        let (a, b) = (u8x16::splat(3), u8x16::splat(9));
        let r = sse2::_mm_avg_epu8(a, b);
        assert_eq!(r, u8x16::splat(6));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_avg_epu16() {
        let (a, b) = (u16x8::splat(3), u16x8::splat(9));
        let r = sse2::_mm_avg_epu16(a, b);
        assert_eq!(r, u16x8::splat(6));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_madd_epi16() {
        let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
@@ -1908,7 +1909,7 @@ fn _mm_madd_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_max_epi16() {
        let a = i16x8::splat(1);
        let b = i16x8::splat(-1);
@@ -1916,7 +1917,7 @@ fn _mm_max_epi16() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_max_epu8() {
        let a = u8x16::splat(1);
        let b = u8x16::splat(255);
@@ -1924,7 +1925,7 @@ fn _mm_max_epu8() {
        assert_eq!(r, b);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_min_epi16() {
        let a = i16x8::splat(1);
        let b = i16x8::splat(-1);
@@ -1932,7 +1933,7 @@ fn _mm_min_epi16() {
        assert_eq!(r, b);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_min_epu8() {
        let a = u8x16::splat(1);
        let b = u8x16::splat(255);
@@ -1940,28 +1941,28 @@ fn _mm_min_epu8() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mulhi_epi16() {
        let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
        let r = sse2::_mm_mulhi_epi16(a, b);
        assert_eq!(r, i16x8::splat(-16));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mulhi_epu16() {
        let (a, b) = (u16x8::splat(1000), u16x8::splat(1001));
        let r = sse2::_mm_mulhi_epu16(a, b);
        assert_eq!(r, u16x8::splat(15));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mullo_epi16() {
        let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
        let r = sse2::_mm_mullo_epi16(a, b);
        assert_eq!(r, i16x8::splat(-17960));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mul_epu32() {
        let a = u32x4::from(u64x2::new(1_000_000_000, 1 << 34));
        let b = u32x4::from(u64x2::new(1_000_000_000, 1 << 35));
@@ -1970,7 +1971,7 @@ fn _mm_mul_epu32() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sad_epu8() {
        let a = u8x16::new(
            255, 254, 253, 252, 1, 2, 3, 4,
@@ -1983,42 +1984,42 @@ fn _mm_sad_epu8() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_epi8() {
        let (a, b) = (i8x16::splat(5), i8x16::splat(6));
        let r = sse2::_mm_sub_epi8(a, b);
        assert_eq!(r, i8x16::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_epi16() {
        let (a, b) = (i16x8::splat(5), i16x8::splat(6));
        let r = sse2::_mm_sub_epi16(a, b);
        assert_eq!(r, i16x8::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_epi32() {
        let (a, b) = (i32x4::splat(5), i32x4::splat(6));
        let r = sse2::_mm_sub_epi32(a, b);
        assert_eq!(r, i32x4::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_epi64() {
        let (a, b) = (i64x2::splat(5), i64x2::splat(6));
        let r = sse2::_mm_sub_epi64(a, b);
        assert_eq!(r, i64x2::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi8() {
        let (a, b) = (i8x16::splat(5), i8x16::splat(2));
        let r = sse2::_mm_subs_epi8(a, b);
        assert_eq!(r, i8x16::splat(3));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi8_saturate_positive() {
        let a = i8x16::splat(0x7F);
        let b = i8x16::splat(-1);
@@ -2026,7 +2027,7 @@ fn _mm_subs_epi8_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi8_saturate_negative() {
        let a = i8x16::splat(-0x80);
        let b = i8x16::splat(1);
@@ -2034,14 +2035,14 @@ fn _mm_subs_epi8_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi16() {
        let (a, b) = (i16x8::splat(5), i16x8::splat(2));
        let r = sse2::_mm_subs_epi16(a, b);
        assert_eq!(r, i16x8::splat(3));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi16_saturate_positive() {
        let a = i16x8::splat(0x7FFF);
        let b = i16x8::splat(-1);
@@ -2049,7 +2050,7 @@ fn _mm_subs_epi16_saturate_positive() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epi16_saturate_negative() {
        let a = i16x8::splat(-0x8000);
        let b = i16x8::splat(1);
@@ -2057,14 +2058,14 @@ fn _mm_subs_epi16_saturate_negative() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epu8() {
        let (a, b) = (u8x16::splat(5), u8x16::splat(2));
        let r = sse2::_mm_subs_epu8(a, b);
        assert_eq!(r, u8x16::splat(3));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epu8_saturate() {
        let a = u8x16::splat(0);
        let b = u8x16::splat(1);
@@ -2072,14 +2073,14 @@ fn _mm_subs_epu8_saturate() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epu16() {
        let (a, b) = (u16x8::splat(5), u16x8::splat(2));
        let r = sse2::_mm_subs_epu16(a, b);
        assert_eq!(r, u16x8::splat(3));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_subs_epu16_saturate() {
        let a = u16x8::splat(0);
        let b = u16x8::splat(1);
@@ -2087,7 +2088,7 @@ fn _mm_subs_epu16_saturate() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_slli_si128() {
        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
@@ -2119,7 +2120,7 @@ fn _mm_slli_si128() {
        assert_eq!(r, __m128i::splat(0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_slli_epi16() {
        let a = i16x8::new(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
@@ -2130,7 +2131,7 @@ fn _mm_slli_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sll_epi16() {
        let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
        let r = sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
@@ -2139,28 +2140,28 @@ fn _mm_sll_epi16() {
        assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_slli_epi32() {
        assert_eq!(
            sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4),
            i32x4::splat(0xFFFF0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sll_epi32() {
        assert_eq!(
            sse2::_mm_sll_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
            i32x4::splat(0xFFFF0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_slli_epi64() {
        assert_eq!(
            sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4),
            i64x2::splat(0xFFFFFFFF0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sll_epi64() {
        assert_eq!(
            sse2::_mm_sll_epi64(
@@ -2168,13 +2169,13 @@ fn _mm_sll_epi64() {
            i64x2::splat(0xFFFFFFFF0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srai_epi16() {
        assert_eq!(
            sse2::_mm_srai_epi16(i16x8::splat(-1), 1), i16x8::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sra_epi16() {
        assert_eq!(
            sse2::_mm_sra_epi16(
@@ -2182,13 +2183,13 @@ fn _mm_sra_epi16() {
            i16x8::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srai_epi32() {
        assert_eq!(
            sse2::_mm_srai_epi32(i32x4::splat(-1), 1), i32x4::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sra_epi32() {
        assert_eq!(
            sse2::_mm_sra_epi32(
@@ -2196,7 +2197,7 @@ fn _mm_sra_epi32() {
            i32x4::splat(-1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srli_si128() {
        let a = __m128i::new(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
@@ -2228,7 +2229,7 @@ fn _mm_srli_si128() {
        assert_eq!(r, __m128i::splat(0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srli_epi16() {
        let a = i16x8::new(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
@@ -2239,7 +2240,7 @@ fn _mm_srli_epi16() {
        assert_eq!(r, e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srl_epi16() {
        let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0);
        let r = sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0));
@@ -2248,28 +2249,28 @@ fn _mm_srl_epi16() {
        assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srli_epi32() {
        assert_eq!(
            sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4),
            i32x4::splat(0xFFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srl_epi32() {
        assert_eq!(
            sse2::_mm_srl_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)),
            i32x4::splat(0xFFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srli_epi64() {
        assert_eq!(
            sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4),
            i64x2::splat(0xFFFFFFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_srl_epi64() {
        assert_eq!(
            sse2::_mm_srl_epi64(
@@ -2277,35 +2278,35 @@ fn _mm_srl_epi64() {
            i64x2::splat(0xFFFFFFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_and_si128() {
        assert_eq!(
            sse2::_mm_and_si128(__m128i::splat(5), __m128i::splat(3)),
            __m128i::splat(1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_andnot_si128() {
        assert_eq!(
            sse2::_mm_andnot_si128(__m128i::splat(5), __m128i::splat(3)),
            __m128i::splat(2));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_or_si128() {
        assert_eq!(
            sse2::_mm_or_si128(__m128i::splat(5), __m128i::splat(3)),
            __m128i::splat(7));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_xor_si128() {
        assert_eq!(
            sse2::_mm_xor_si128(__m128i::splat(5), __m128i::splat(3)),
            __m128i::splat(6));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpeq_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -2316,7 +2317,7 @@ fn _mm_cmpeq_epi8() {
            0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpeq_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
@@ -2324,7 +2325,7 @@ fn _mm_cmpeq_epi16() {
        assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpeq_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(3, 2, 2, 0);
@@ -2332,7 +2333,7 @@ fn _mm_cmpeq_epi32() {
        assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpgt_epi8() {
        let a = i8x16::splat(0).replace(0, 5);
        let b = i8x16::splat(0);
@@ -2340,7 +2341,7 @@ fn _mm_cmpgt_epi8() {
        assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpgt_epi16() {
        let a = i16x8::splat(0).replace(0, 5);
        let b = i16x8::splat(0);
@@ -2348,7 +2349,7 @@ fn _mm_cmpgt_epi16() {
        assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpgt_epi32() {
        let a = i32x4::splat(0).replace(0, 5);
        let b = i32x4::splat(0);
@@ -2356,7 +2357,7 @@ fn _mm_cmpgt_epi32() {
        assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmplt_epi8() {
        let a = i8x16::splat(0);
        let b = i8x16::splat(0).replace(0, 5);
@@ -2364,7 +2365,7 @@ fn _mm_cmplt_epi8() {
        assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmplt_epi16() {
        let a = i16x8::splat(0);
        let b = i16x8::splat(0).replace(0, 5);
@@ -2372,7 +2373,7 @@ fn _mm_cmplt_epi16() {
        assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmplt_epi32() {
        let a = i32x4::splat(0);
        let b = i32x4::splat(0).replace(0, 5);
@@ -2380,69 +2381,69 @@ fn _mm_cmplt_epi32() {
        assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtepi32_pd() {
        let a = sse2::_mm_set_epi32(35, 25, 15, 5);
        let r = sse2::_mm_cvtepi32_pd(a);
        assert_eq!(r, f64x2::new(5.0, 15.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi32_sd() {
        let a = f64x2::splat(3.5);
        assert_eq!(sse2::_mm_cvtsi32_sd(a, 5), f64x2::new(5.0, 3.5));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi64_sd() {
        let a = f64x2::splat(3.5);
        assert_eq!(sse2::_mm_cvtsi64_sd(a, 5), f64x2::new(5.0, 3.5));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtepi32_ps() {
        let a = i32x4::new(1, 2, 3, 4);
        assert_eq!(sse2::_mm_cvtepi32_ps(a), f32x4::new(1.0, 2.0, 3.0, 4.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi32_si128() {
        assert_eq!(sse2::_mm_cvtsi32_si128(5), i32x4::new(5, 0, 0, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi64_si128() {
        assert_eq!(sse2::_mm_cvtsi64_si128(5), i64x2::new(5, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi128_si32() {
        assert_eq!(sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)), 5);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cvtsi128_si64() {
        assert_eq!(sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)), 5);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set_epi64x() {
        assert_eq!(sse2::_mm_set_epi64x(0, 1), i64x2::new(1, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set_epi32() {
        assert_eq!(sse2::_mm_set_epi32(0, 1, 2, 3), i32x4::new(3, 2, 1, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set_epi16() {
        assert_eq!(
            sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7),
            i16x8::new(7, 6, 5, 4, 3, 2, 1, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set_epi8() {
        assert_eq!(
            sse2::_mm_set_epi8(
@@ -2450,39 +2451,39 @@ fn _mm_set_epi8() {
            i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set1_epi64x() {
        assert_eq!(sse2::_mm_set1_epi64x(1), i64x2::splat(1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set1_epi32() {
        assert_eq!(sse2::_mm_set1_epi32(1), i32x4::splat(1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set1_epi16() {
        assert_eq!(sse2::_mm_set1_epi16(1), i16x8::splat(1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_set1_epi8() {
        assert_eq!(sse2::_mm_set1_epi8(1), i8x16::splat(1));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_setr_epi32() {
        assert_eq!(sse2::_mm_setr_epi32(0, 1, 2, 3), i32x4::new(0, 1, 2, 3));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_setr_epi16() {
        assert_eq!(
            sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7),
            i16x8::new(0, 1, 2, 3, 4, 5, 6, 7));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_setr_epi8() {
        assert_eq!(
            sse2::_mm_setr_epi8(
@@ -2490,33 +2491,33 @@ fn _mm_setr_epi8() {
            i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_setzero_si128() {
        assert_eq!(sse2::_mm_setzero_si128(), __m128i::from(i64x2::splat(0)));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_loadl_epi64() {
        let a = i64x2::new(6, 5);
        let r = unsafe { sse2::_mm_loadl_epi64(&a as *const _) };
        assert_eq!(r, i64x2::new(6, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_load_si128() {
        let a = sse2::_mm_set_epi64x(5, 6);
        let r = unsafe { sse2::_mm_load_si128(&a as *const _ as *const _) };
        assert_eq!(a, i64x2::from(r));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_loadu_si128() {
        let a = sse2::_mm_set_epi64x(5, 6);
        let r = unsafe { sse2::_mm_loadu_si128(&a as *const _ as *const _) };
        assert_eq!(a, i64x2::from(r));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_maskmoveu_si128() {
        let a = i8x16::splat(9);
        let mask = i8x16::splat(0).replace(2, 0x80u8 as i8);
@@ -2527,7 +2528,7 @@ fn _mm_maskmoveu_si128() {
        assert_eq!(r, i8x16::splat(0).replace(2, 9));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_store_si128() {
        let a = __m128i::splat(9);
        let mut r = __m128i::splat(0);
@@ -2537,7 +2538,7 @@ fn _mm_store_si128() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_storeu_si128() {
        let a = __m128i::splat(9);
        let mut r = __m128i::splat(0);
@@ -2547,7 +2548,7 @@ fn _mm_storeu_si128() {
        assert_eq!(r, a);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_storel_epi64() {
        let a = __m128i::from(i64x2::new(2, 9));
        let mut r = __m128i::splat(0);
@@ -2557,13 +2558,13 @@ fn _mm_storel_epi64() {
        assert_eq!(r, __m128i::from(i64x2::new(2, 0)));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_move_epi64() {
        let a = i64x2::new(5, 6);
        assert_eq!(sse2::_mm_move_epi64(a), i64x2::new(5, 0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_packs_epi16() {
        let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
@@ -2573,7 +2574,7 @@ fn _mm_packs_epi16() {
            0, 0, 0, 0, 0, 0, -0x80, 0x7F));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_packs_epi32() {
        let a = i32x4::new(0x8000, -0x8001, 0, 0);
        let b = i32x4::new(0, 0, -0x8001, 0x8000);
@@ -2582,7 +2583,7 @@ fn _mm_packs_epi32() {
            r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_packus_epi16() {
        let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100);
@@ -2592,19 +2593,19 @@ fn _mm_packus_epi16() {
            0, 0, 0, 0, 0, 0, 0, 0xFF));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_extract_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq!(sse2::_mm_extract_epi16(a, 5), 5);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_insert_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_movemask_epi8() {
        let a = i8x16::from(u8x16::new(
            0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
@@ -2612,28 +2613,28 @@ fn _mm_movemask_epi8() {
        assert_eq!(sse2::_mm_movemask_epi8(a), 0b10100100_00100101);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_shuffle_epi32() {
        let a = i32x4::new(5, 10, 15, 20);
        let e = i32x4::new(20, 10, 10, 5);
        assert_eq!(sse2::_mm_shuffle_epi32(a, 0b00_01_01_11), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_shufflehi_epi16() {
        let a = i16x8::new(1, 2, 3, 4, 5, 10, 15, 20);
        let e = i16x8::new(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq!(sse2::_mm_shufflehi_epi16(a, 0b00_01_01_11), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_shufflelo_epi16() {
        let a = i16x8::new(5, 10, 15, 20, 1, 2, 3, 4);
        let e = i16x8::new(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq!(sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -2644,7 +2645,7 @@ fn _mm_unpackhi_epi8() {
        assert_eq!(sse2::_mm_unpackhi_epi8(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
@@ -2652,7 +2653,7 @@ fn _mm_unpackhi_epi16() {
        assert_eq!(sse2::_mm_unpackhi_epi16(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
@@ -2660,7 +2661,7 @@ fn _mm_unpackhi_epi32() {
        assert_eq!(sse2::_mm_unpackhi_epi32(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpackhi_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
@@ -2668,7 +2669,7 @@ fn _mm_unpackhi_epi64() {
        assert_eq!(sse2::_mm_unpackhi_epi64(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -2679,7 +2680,7 @@ fn _mm_unpacklo_epi8() {
        assert_eq!(sse2::_mm_unpacklo_epi8(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi16() {
        let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
@@ -2687,7 +2688,7 @@ fn _mm_unpacklo_epi16() {
        assert_eq!(sse2::_mm_unpacklo_epi16(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi32() {
        let a = i32x4::new(0, 1, 2, 3);
        let b = i32x4::new(4, 5, 6, 7);
@@ -2695,7 +2696,7 @@ fn _mm_unpacklo_epi32() {
        assert_eq!(sse2::_mm_unpacklo_epi32(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_unpacklo_epi64() {
        let a = i64x2::new(0, 1);
        let b = i64x2::new(2, 3);
@@ -2703,105 +2704,105 @@ fn _mm_unpacklo_epi64() {
        assert_eq!(sse2::_mm_unpacklo_epi64(a, b), e);
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_sd() {
        assert_eq!(
            sse2::_mm_add_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(6.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_add_pd() {
        assert_eq!(
            sse2::_mm_add_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(6.0, 12.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_div_sd() {
        assert_eq!(
            sse2::_mm_div_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(0.2, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_div_pd() {
        assert_eq!(
            sse2::_mm_div_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(0.2, 0.2));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_max_sd() {
        assert_eq!(
            sse2::_mm_max_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(5.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_max_pd() {
        assert_eq!(
            sse2::_mm_max_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(5.0, 10.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_min_sd() {
        assert_eq!(
            sse2::_mm_min_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(1.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_min_pd() {
        assert_eq!(
            sse2::_mm_min_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(1.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mul_sd() {
        assert_eq!(
            sse2::_mm_mul_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(5.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_mul_pd() {
        assert_eq!(
            sse2::_mm_mul_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(5.0, 20.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sqrt_sd() {
        assert_eq!(
            sse2::_mm_sqrt_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(5.0f64.sqrt(), 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sqrt_pd() {
        assert_eq!(
            sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)),
            f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_sd() {
        assert_eq!(
            sse2::_mm_sub_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(-4.0, 2.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_sub_pd() {
        assert_eq!(
            sse2::_mm_sub_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)),
            f64x2::new(-4.0, -8.0));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_and_pd() {
        use std::mem::transmute;

@@ -2813,7 +2814,7 @@ fn _mm_and_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_andnot_pd() {
        use std::mem::transmute;

@@ -2825,7 +2826,7 @@ fn _mm_andnot_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_or_pd() {
        use std::mem::transmute;

@@ -2837,7 +2838,7 @@ fn _mm_or_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_xor_pd() {
        use std::mem::transmute;

@@ -2849,7 +2850,7 @@ fn _mm_xor_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpeq_sd() {
        use std::mem::transmute;

@@ -2861,7 +2862,7 @@ fn _mm_cmpeq_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmplt_sd() {
        use std::mem::transmute;

@@ -2873,7 +2874,7 @@ fn _mm_cmplt_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmple_sd() {
        use std::mem::transmute;

@@ -2885,7 +2886,7 @@ fn _mm_cmple_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpgt_sd() {
        use std::mem::transmute;

@@ -2897,7 +2898,7 @@ fn _mm_cmpgt_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpge_sd() {
        use std::mem::transmute;

@@ -2909,7 +2910,7 @@ fn _mm_cmpge_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpord_sd() {
        use std::f64::NAN;
        use std::mem::transmute;
@@ -2922,7 +2923,7 @@ fn _mm_cmpord_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpunord_sd() {
        use std::f64::NAN;
        use std::mem::transmute;
@@ -2935,7 +2936,7 @@ fn _mm_cmpunord_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpneq_sd() {
        use std::mem::transmute;

@@ -2947,7 +2948,7 @@ fn _mm_cmpneq_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnlt_sd() {
        use std::mem::transmute;

@@ -2959,7 +2960,7 @@ fn _mm_cmpnlt_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnle_sd() {
        use std::mem::transmute;

@@ -2971,7 +2972,7 @@ fn _mm_cmpnle_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpngt_sd() {
        use std::mem::transmute;

@@ -2983,7 +2984,7 @@ fn _mm_cmpngt_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnge_sd() {
        use std::mem::transmute;

@@ -2995,7 +2996,7 @@ fn _mm_cmpnge_sd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpeq_pd() {
        use std::mem::transmute;

@@ -3007,7 +3008,7 @@ fn _mm_cmpeq_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmplt_pd() {
        use std::mem::transmute;

@@ -3019,7 +3020,7 @@ fn _mm_cmplt_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmple_pd() {
        use std::mem::transmute;

@@ -3031,7 +3032,7 @@ fn _mm_cmple_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpgt_pd() {
        use std::mem::transmute;

@@ -3043,7 +3044,7 @@ fn _mm_cmpgt_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpge_pd() {
        use std::mem::transmute;

@@ -3055,7 +3056,7 @@ fn _mm_cmpge_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpord_pd() {
        use std::f64::NAN;
        use std::mem::transmute;
@@ -3068,7 +3069,7 @@ fn _mm_cmpord_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpunord_pd() {
        use std::f64::NAN;
        use std::mem::transmute;
@@ -3081,7 +3082,7 @@ fn _mm_cmpunord_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpneq_pd() {
        use std::mem::transmute;

@@ -3093,7 +3094,7 @@ fn _mm_cmpneq_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnlt_pd() {
        use std::mem::transmute;

@@ -3105,7 +3106,7 @@ fn _mm_cmpnlt_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnle_pd() {
        use std::mem::transmute;

@@ -3117,7 +3118,7 @@ fn _mm_cmpnle_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpngt_pd() {
        use std::mem::transmute;

@@ -3129,7 +3130,7 @@ fn _mm_cmpngt_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_cmpnge_pd() {
        use std::mem::transmute;

@@ -3141,7 +3142,7 @@ fn _mm_cmpnge_pd() {
        }
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comieq_sd() {
        use std::f64::NAN;

@@ -3152,37 +3153,37 @@ fn _mm_comieq_sd() {
        assert!(!sse2::_mm_comieq_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comilt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_comilt_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comile_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(sse2::_mm_comile_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comigt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_comigt_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comige_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(sse2::_mm_comige_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_comineq_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_comineq_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomieq_sd() {
        use std::f64::NAN;

@@ -3193,37 +3194,37 @@ fn _mm_ucomieq_sd() {
        assert!(!sse2::_mm_ucomieq_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomilt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_ucomilt_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomile_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(sse2::_mm_ucomile_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomigt_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_ucomigt_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomige_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(sse2::_mm_ucomige_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_ucomineq_sd() {
        let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
        assert!(!sse2::_mm_ucomineq_sd(a, b));
    }

-    #[test]
+    #[simd_test = "sse2"]
    fn _mm_movemask_pd() {
        let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0));
        assert_eq!(r, 0b01);
@@ -1,8 +1,12 @@
 use v128::*;
 use x86::__m128i;

+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 #[inline(always)]
 #[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pblendvb))]
 pub fn _mm_blendv_epi8(
    a: __m128i,
    b: __m128i,
@@ -57,13 +61,14 @@ macro_rules! call {
    fn dpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
 }

-#[cfg(all(test, target_feature = "sse4.1", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use v128::*;
    use x86::sse41;

-    #[test]
-    #[target_feature = "+sse4.1"]
+    #[simd_test = "sse4.1"]
    fn _mm_blendv_epi8() {
        let a = i8x16::new(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -76,8 +81,7 @@ fn _mm_blendv_epi8() {
        assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
    }

-    #[test]
-    #[target_feature = "+sse4.1"]
+    #[simd_test = "sse4.1"]
    fn _mm_dp_pd() {
        let a = f64x2::new(2.0, 3.0);
        let b = f64x2::new(1.0, 4.0);
@@ -85,8 +89,7 @@ fn _mm_dp_pd() {
        assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
    }

-    #[test]
-    #[target_feature = "+sse4.1"]
+    #[simd_test = "sse4.1"]
    fn _mm_dp_ps() {
        let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
        let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
@@ -40,13 +40,14 @@ macro_rules! call {
    fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
 }

-#[cfg(all(test, target_feature = "sse4.2", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use v128::*;
    use x86::{__m128i, sse42};

-    #[test]
-    #[target_feature = "+sse4.2"]
+    #[simd_test = "sse4.2"]
    fn _mm_cmpestri() {
        let a = &b"bar             "[..];
        let b = &b"foobar          "[..];
@@ -1,15 +1,17 @@
 use v128::*;

+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 /// Compute the absolute value of packed 8-bit signed integers in `a` and
 /// return the unsigned results.
 #[inline(always)]
 #[target_feature = "+ssse3"]
+#[cfg_attr(test, assert_instr(pabsb))]
 pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
    unsafe { pabsb128(a) }
 }

-
-
 /// Shuffle bytes from `a` according to the content of `b`.
 ///
 /// The last 4 bits of each byte of `b` are used as addresses
@@ -36,6 +38,7 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
 /// ```
 #[inline(always)]
 #[target_feature = "+ssse3"]
+#[cfg_attr(test, assert_instr(pshufb))]
 pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
    unsafe { pshufb128(a, b) }
 }
@@ -50,20 +53,20 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
 }

-#[cfg(all(test, target_feature = "ssse3", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use v128::*;
    use x86::ssse3 as ssse3;

-    #[test]
-    #[target_feature = "+ssse3"]
+    #[simd_test = "ssse3"]
    fn _mm_abs_epi8() {
        let r = ssse3::_mm_abs_epi8(i8x16::splat(-5));
        assert_eq!(r, u8x16::splat(5));
    }

-    #[test]
-    #[target_feature = "+ssse3"]
+    #[simd_test = "ssse3"]
    fn _mm_shuffle_epi8() {
        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
@@ -8,7 +8,7 @@
 //! provides a quick overview of the available instructions.

 #[cfg(test)]
-use assert_instr::assert_instr;
+use stdsimd_test::assert_instr;

 // TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32
 /*
@@ -252,40 +252,38 @@ pub fn _tzmsk_u64(x: u64) -> u64 {
    !x & (x.wrapping_sub(1))
 }

-#[cfg(all(test, target_feature = "tbm", any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(test)]
 mod tests {
+    use stdsimd_test::simd_test;
+
    use x86::tbm;

    /*
-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _bextr_u32() {
        assert_eq!(tbm::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _bextr_u64() {
        assert_eq!(tbm::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
    }
    */

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blcfill_u32() {
        assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
        assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blcfill_u64() {
        assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
        assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blci_u32() {
        assert_eq!(tbm::_blci_u32(0b0101_0000u32),
                   0b1111_1111_1111_1111_1111_1111_1111_1110u32);
@@ -293,8 +291,8 @@ fn _blci_u32() {
                   0b1111_1111_1111_1111_1111_1110_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blci_u64() {
        assert_eq!(tbm::_blci_u64(0b0101_0000u64),
                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
@@ -302,99 +300,92 @@ fn _blci_u64() {
                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blcic_u32() {
        assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
        assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blcic_u64() {
        assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
        assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blcmsk_u32() {
        assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
        assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blcmsk_u64() {
        assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
        assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blcs_u32() {
       assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
       assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blcs_u64() {
       assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
       assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blsfill_u32() {
        assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
        assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blsfill_u64() {
        assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
        assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _blsic_u32() {
        assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
        assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _blsic_u64() {
        assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
       assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _t1mskc_u32() {
       assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
       assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _t1mksc_u64() {
       assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
       assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
    fn _tzmsk_u32() {
        assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
        assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
    }

-    #[test]
-    #[target_feature = "+tbm"]
+    #[simd_test = "tbm"]
+    #[cfg(not(target_arch = "x86"))]
    fn _tzmsk_u64() {
        assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
        assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
@@ -1,10 +1,11 @@
 [package]
-name = "assert-instr"
+name = "stdsimd-test"
 version = "0.1.0"
 authors = ["Alex Crichton <alex@alexcrichton.com>"]

 [dependencies]
 assert-instr-macro = { path = "assert-instr-macro" }
+simd-test-macro = { path = "simd-test-macro" }
 backtrace = "0.3"
 cc = "1.0"
 lazy_static = "0.2"
@@ -44,7 +44,7 @@ pub fn assert_instr(attr: TokenStream, item: TokenStream) -> TokenStream {
        #[allow(non_snake_case)]
        {ignore}
        fn assert_instr_{name}() {{
-            ::assert_instr::assert({name} as usize,
+            ::stdsimd_test::assert({name} as usize,
                                   \"{name}\",
                                   \"{instr}\");
        }}
@@ -0,0 +1,11 @@
+[package]
+name = "simd-test-macro"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+
+[lib]
+proc-macro = true
+
+[dependencies]
+proc-macro2 = { version = "0.1", features = ["unstable"] }
+quote = { git = 'https://github.com/dtolnay/quote' }
@@ -0,0 +1,76 @@
+//! Implementation of the `#[simd_test]` macro
+//!
+//! This macro expands to a `#[test]` function which tests the local machine for
+//! the appropriate cfg before calling the inner test function.
+
+#![feature(proc_macro)]
+
+#[macro_use]
+extern crate quote;
+extern crate proc_macro;
+extern crate proc_macro2;
+
+use proc_macro2::{TokenStream, Term, TokenNode, TokenTree};
+use proc_macro2::Literal;
+
+fn string(s: &str) -> TokenTree {
+    TokenTree {
+        kind: TokenNode::Literal(Literal::string(s)),
+
+        span: Default::default(),
+    }
+}
+
+#[proc_macro_attribute]
+pub fn simd_test(attr: proc_macro::TokenStream,
+                 item: proc_macro::TokenStream) -> proc_macro::TokenStream {
+    let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
+    if tokens.len() != 2 {
+        panic!("expected #[simd_test = \"feature\"]");
+    }
+    match tokens[0].kind {
+        TokenNode::Op('=', _) => {}
+        _ => panic!("expected #[simd_test = \"feature\"]"),
+    }
+    let target_feature = &tokens[1];
+    let enable_feature = match tokens[1].kind {
+        TokenNode::Literal(ref l) => l.to_string(),
+        _ => panic!("expected #[simd_test = \"feature\"]"),
+    };
+    let enable_feature = enable_feature.trim_left_matches('"')
+                                       .trim_right_matches('"');
+    let enable_feature = string(&format!("+{}", enable_feature));
+    let item = TokenStream::from(item);
+    let name = find_name(item.clone());
+
+    let name: TokenStream = name.as_str().parse().unwrap();
+
+    let ret: TokenStream = quote! {
+        #[test]
+        fn #name() {
+            if cfg_feature_enabled!(#target_feature) {
+                return #name();
+            }
+
+            #[target_feature = #enable_feature]
+            #item
+        }
+    }.into();
+    ret.into()
+}
+
+fn find_name(item: TokenStream) -> Term {
+    let mut tokens = item.into_iter();
+    while let Some(tok) = tokens.next() {
+        if let TokenNode::Term(word) = tok.kind {
+            if word.as_str() == "fn" {
+                break
+            }
+        }
+    }
+
+    match tokens.next().map(|t| t.kind) {
+        Some(TokenNode::Term(word)) => word,
+        _ => panic!("failed to find function name"),
+    }
+}
@@ -1,4 +1,4 @@
-//! Runtime support needed for the `#![assert_instr]` macro
+//! Runtime support needed for testing the stdsimd crate.
 //!
 //! This basically just disassembles the current executable and then parses the
 //! output once globally and then provides the `assert` function which makes
@@ -7,6 +7,7 @@
 #![feature(proc_macro)]

 extern crate assert_instr_macro;
+extern crate simd_test_macro;
 extern crate backtrace;
 extern crate cc;
 extern crate rustc_demangle;
@@ -19,6 +20,7 @@
 use std::str;

 pub use assert_instr_macro::*;
+pub use simd_test_macro::*;

 lazy_static! {
    static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
@@ -0,0 +1,25 @@
+#![feature(cfg_target_feature)]
+
+#[macro_use]
+extern crate stdsimd;
+extern crate cupid;
+
+#[test]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn works() {
+    let information = cupid::master().unwrap();
+    assert_eq!(cfg_feature_enabled!("sse"), information.sse());
+    assert_eq!(cfg_feature_enabled!("sse2"), information.sse2());
+    assert_eq!(cfg_feature_enabled!("sse3"), information.sse3());
+    assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3());
+    assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1());
+    assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2());
+    assert_eq!(cfg_feature_enabled!("avx"), information.avx());
+    assert_eq!(cfg_feature_enabled!("avx2"), information.avx2());
+    assert_eq!(cfg_feature_enabled!("fma"), information.fma());
+    assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1());
+    assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2());
+    assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt());
+
+    // TODO: tbm, abm, lzcnt
+}