manually const-ify shuffle arguments (#1160)

2026-05-15 20:45:45 +03:00 · 2021-05-11 22:11:52 +02:00
parent 7516a80c31
commit a34883b5d3
17 changed files with 1655 additions and 1549 deletions
@@ -40,8 +40,8 @@ pub unsafe fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabdl))]
 pub unsafe fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
-    let c: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let d: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let d: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    simd_cast(vabd_u8(c, d))
 }

@@ -50,8 +50,8 @@ pub unsafe fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabdl))]
 pub unsafe fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
-    let c: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let d: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let c: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let d: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    simd_cast(vabd_u16(c, d))
 }

@@ -60,8 +60,8 @@ pub unsafe fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabdl))]
 pub unsafe fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
-    let c: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let d: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let c: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let d: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    simd_cast(vabd_u32(c, d))
 }

@@ -70,8 +70,8 @@ pub unsafe fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabdl))]
 pub unsafe fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
-    let c: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let d: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let d: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let e: uint8x8_t = simd_cast(vabd_s8(c, d));
    simd_cast(e)
 }
@@ -81,8 +81,8 @@ pub unsafe fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabdl))]
 pub unsafe fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
-    let c: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let d: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let c: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let d: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let e: uint16x4_t = simd_cast(vabd_s16(c, d));
    simd_cast(e)
 }
@@ -92,8 +92,8 @@ pub unsafe fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabdl))]
 pub unsafe fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
-    let c: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let d: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let c: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let d: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let e: uint32x2_t = simd_cast(vabd_s32(c, d));
    simd_cast(e)
 }
@@ -1171,14 +1171,14 @@ pub unsafe fn vcopy_lane_s8<const LANE1: i32, const LANE2: i32>(a: int8x8_t, b:
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1192,22 +1192,22 @@ pub unsafe fn vcopyq_laneq_s8<const LANE1: i32, const LANE2: i32>(a: int8x16_t,
    static_assert_imm4!(LANE1);
    static_assert_imm4!(LANE2);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1221,10 +1221,10 @@ pub unsafe fn vcopy_lane_s16<const LANE1: i32, const LANE2: i32>(a: int16x4_t, b
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1238,14 +1238,14 @@ pub unsafe fn vcopyq_laneq_s16<const LANE1: i32, const LANE2: i32>(a: int16x8_t,
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1259,8 +1259,8 @@ pub unsafe fn vcopy_lane_s32<const LANE1: i32, const LANE2: i32>(a: int32x2_t, b
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1274,10 +1274,10 @@ pub unsafe fn vcopyq_laneq_s32<const LANE1: i32, const LANE2: i32>(a: int32x4_t,
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1291,8 +1291,8 @@ pub unsafe fn vcopyq_laneq_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t,
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1306,14 +1306,14 @@ pub unsafe fn vcopy_lane_u8<const LANE1: i32, const LANE2: i32>(a: uint8x8_t, b:
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1327,22 +1327,22 @@ pub unsafe fn vcopyq_laneq_u8<const LANE1: i32, const LANE2: i32>(a: uint8x16_t,
    static_assert_imm4!(LANE1);
    static_assert_imm4!(LANE2);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1356,10 +1356,10 @@ pub unsafe fn vcopy_lane_u16<const LANE1: i32, const LANE2: i32>(a: uint16x4_t,
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1373,14 +1373,14 @@ pub unsafe fn vcopyq_laneq_u16<const LANE1: i32, const LANE2: i32>(a: uint16x8_t
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1394,8 +1394,8 @@ pub unsafe fn vcopy_lane_u32<const LANE1: i32, const LANE2: i32>(a: uint32x2_t,
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1409,10 +1409,10 @@ pub unsafe fn vcopyq_laneq_u32<const LANE1: i32, const LANE2: i32>(a: uint32x4_t
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1426,8 +1426,8 @@ pub unsafe fn vcopyq_laneq_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1441,14 +1441,14 @@ pub unsafe fn vcopy_lane_p8<const LANE1: i32, const LANE2: i32>(a: poly8x8_t, b:
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1462,22 +1462,22 @@ pub unsafe fn vcopyq_laneq_p8<const LANE1: i32, const LANE2: i32>(a: poly8x16_t,
    static_assert_imm4!(LANE1);
    static_assert_imm4!(LANE2);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1491,10 +1491,10 @@ pub unsafe fn vcopy_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x4_t,
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1508,14 +1508,14 @@ pub unsafe fn vcopyq_laneq_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t
    static_assert_imm3!(LANE1);
    static_assert_imm3!(LANE2);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1529,8 +1529,8 @@ pub unsafe fn vcopyq_laneq_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1544,8 +1544,8 @@ pub unsafe fn vcopy_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x2_t,
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1559,10 +1559,10 @@ pub unsafe fn vcopyq_laneq_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_
    static_assert_imm2!(LANE1);
    static_assert_imm2!(LANE2);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1576,8 +1576,8 @@ pub unsafe fn vcopyq_laneq_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_
    static_assert_imm1!(LANE1);
    static_assert_imm1!(LANE2);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1590,16 +1590,16 @@ pub unsafe fn vcopyq_laneq_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_
 pub unsafe fn vcopy_laneq_s8<const LANE1: i32, const LANE2: i32>(a: int8x8_t, b: int8x16_t) -> int8x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm4!(LANE2);
-    let a: int8x16_t = simd_shuffle16(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: int8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1612,12 +1612,12 @@ pub unsafe fn vcopy_laneq_s8<const LANE1: i32, const LANE2: i32>(a: int8x8_t, b:
 pub unsafe fn vcopy_laneq_s16<const LANE1: i32, const LANE2: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm3!(LANE2);
-    let a: int16x8_t = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let a: int16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [8 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 8 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 8 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 8 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1630,10 +1630,10 @@ pub unsafe fn vcopy_laneq_s16<const LANE1: i32, const LANE2: i32>(a: int16x4_t,
 pub unsafe fn vcopy_laneq_s32<const LANE1: i32, const LANE2: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
    static_assert_imm1!(LANE1);
    static_assert_imm2!(LANE2);
-    let a: int32x4_t = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let a: int32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [4 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 4 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1646,16 +1646,16 @@ pub unsafe fn vcopy_laneq_s32<const LANE1: i32, const LANE2: i32>(a: int32x2_t,
 pub unsafe fn vcopy_laneq_u8<const LANE1: i32, const LANE2: i32>(a: uint8x8_t, b: uint8x16_t) -> uint8x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm4!(LANE2);
-    let a: uint8x16_t = simd_shuffle16(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: uint8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1668,12 +1668,12 @@ pub unsafe fn vcopy_laneq_u8<const LANE1: i32, const LANE2: i32>(a: uint8x8_t, b
 pub unsafe fn vcopy_laneq_u16<const LANE1: i32, const LANE2: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm3!(LANE2);
-    let a: uint16x8_t = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let a: uint16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [8 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 8 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 8 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 8 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1686,10 +1686,10 @@ pub unsafe fn vcopy_laneq_u16<const LANE1: i32, const LANE2: i32>(a: uint16x4_t,
 pub unsafe fn vcopy_laneq_u32<const LANE1: i32, const LANE2: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
    static_assert_imm1!(LANE1);
    static_assert_imm2!(LANE2);
-    let a: uint32x4_t = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let a: uint32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [4 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 4 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1702,16 +1702,16 @@ pub unsafe fn vcopy_laneq_u32<const LANE1: i32, const LANE2: i32>(a: uint32x2_t,
 pub unsafe fn vcopy_laneq_p8<const LANE1: i32, const LANE2: i32>(a: poly8x8_t, b: poly8x16_t) -> poly8x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm4!(LANE2);
-    let a: poly8x16_t = simd_shuffle16(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: poly8x16_t = simd_shuffle16!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1724,12 +1724,12 @@ pub unsafe fn vcopy_laneq_p8<const LANE1: i32, const LANE2: i32>(a: poly8x8_t, b
 pub unsafe fn vcopy_laneq_p16<const LANE1: i32, const LANE2: i32>(a: poly16x4_t, b: poly16x8_t) -> poly16x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm3!(LANE2);
-    let a: poly16x8_t = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let a: poly16x8_t = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [8 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 8 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 8 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 8 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1742,10 +1742,10 @@ pub unsafe fn vcopy_laneq_p16<const LANE1: i32, const LANE2: i32>(a: poly16x4_t,
 pub unsafe fn vcopy_laneq_f32<const LANE1: i32, const LANE2: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
    static_assert_imm1!(LANE1);
    static_assert_imm2!(LANE2);
-    let a: float32x4_t = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let a: float32x4_t = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [4 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 4 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1758,24 +1758,24 @@ pub unsafe fn vcopy_laneq_f32<const LANE1: i32, const LANE2: i32>(a: float32x2_t
 pub unsafe fn vcopyq_lane_s8<const LANE1: i32, const LANE2: i32>(a: int8x16_t, b: int8x8_t) -> int8x16_t {
    static_assert_imm4!(LANE1);
    static_assert_imm3!(LANE2);
-    let b: int8x16_t = simd_shuffle16(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1788,16 +1788,16 @@ pub unsafe fn vcopyq_lane_s8<const LANE1: i32, const LANE2: i32>(a: int8x16_t, b
 pub unsafe fn vcopyq_lane_s16<const LANE1: i32, const LANE2: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm2!(LANE2);
-    let b: int16x8_t = simd_shuffle8(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let b: int16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1810,12 +1810,12 @@ pub unsafe fn vcopyq_lane_s16<const LANE1: i32, const LANE2: i32>(a: int16x8_t,
 pub unsafe fn vcopyq_lane_s32<const LANE1: i32, const LANE2: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm1!(LANE2);
-    let b: int32x4_t = simd_shuffle4(b, b, [0, 1, 2, 3]);
+    let b: int32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1828,24 +1828,24 @@ pub unsafe fn vcopyq_lane_s32<const LANE1: i32, const LANE2: i32>(a: int32x4_t,
 pub unsafe fn vcopyq_lane_u8<const LANE1: i32, const LANE2: i32>(a: uint8x16_t, b: uint8x8_t) -> uint8x16_t {
    static_assert_imm4!(LANE1);
    static_assert_imm3!(LANE2);
-    let b: uint8x16_t = simd_shuffle16(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1858,16 +1858,16 @@ pub unsafe fn vcopyq_lane_u8<const LANE1: i32, const LANE2: i32>(a: uint8x16_t,
 pub unsafe fn vcopyq_lane_u16<const LANE1: i32, const LANE2: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm2!(LANE2);
-    let b: uint16x8_t = simd_shuffle8(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let b: uint16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1880,12 +1880,12 @@ pub unsafe fn vcopyq_lane_u16<const LANE1: i32, const LANE2: i32>(a: uint16x8_t,
 pub unsafe fn vcopyq_lane_u32<const LANE1: i32, const LANE2: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm1!(LANE2);
-    let b: uint32x4_t = simd_shuffle4(b, b, [0, 1, 2, 3]);
+    let b: uint32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1898,24 +1898,24 @@ pub unsafe fn vcopyq_lane_u32<const LANE1: i32, const LANE2: i32>(a: uint32x4_t,
 pub unsafe fn vcopyq_lane_p8<const LANE1: i32, const LANE2: i32>(a: poly8x16_t, b: poly8x8_t) -> poly8x16_t {
    static_assert_imm4!(LANE1);
    static_assert_imm3!(LANE2);
-    let b: poly8x16_t = simd_shuffle16(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: poly8x16_t = simd_shuffle16!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    match LANE1 & 0b1111 {
-        0 => simd_shuffle16(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        2 => simd_shuffle16(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        3 => simd_shuffle16(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        4 => simd_shuffle16(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        5 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        6 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        7 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
-        8 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
-        9 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
-        10 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
-        11 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
-        12 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
-        13 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
-        14 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
-        15 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
+        0 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        2 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        3 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        4 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        5 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        6 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        7 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]),
+        8 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]),
+        9 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]),
+        10 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]),
+        11 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]),
+        12 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]),
+        13 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]),
+        14 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]),
+        15 => simd_shuffle16!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1928,16 +1928,16 @@ pub unsafe fn vcopyq_lane_p8<const LANE1: i32, const LANE2: i32>(a: poly8x16_t,
 pub unsafe fn vcopyq_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t, b: poly16x4_t) -> poly16x8_t {
    static_assert_imm3!(LANE1);
    static_assert_imm2!(LANE2);
-    let b: poly16x8_t = simd_shuffle8(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let b: poly16x8_t = simd_shuffle8!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);
    match LANE1 & 0b111 {
-        0 => simd_shuffle8(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
-        2 => simd_shuffle8(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
-        3 => simd_shuffle8(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
-        4 => simd_shuffle8(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
-        5 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
-        6 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
-        7 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
+        0 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]),
+        2 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]),
+        3 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]),
+        4 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]),
+        5 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]),
+        6 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]),
+        7 => simd_shuffle8!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1950,10 +1950,10 @@ pub unsafe fn vcopyq_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t,
 pub unsafe fn vcopyq_lane_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t, b: int64x1_t) -> int64x2_t {
    static_assert_imm1!(LANE1);
    static_assert!(LANE2 : i32 where LANE2 == 0);
-    let b: int64x2_t = simd_shuffle2(b, b, [0, 1]);
+    let b: int64x2_t = simd_shuffle2!(b, b, [0, 1]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1966,10 +1966,10 @@ pub unsafe fn vcopyq_lane_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t,
 pub unsafe fn vcopyq_lane_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t, b: uint64x1_t) -> uint64x2_t {
    static_assert_imm1!(LANE1);
    static_assert!(LANE2 : i32 where LANE2 == 0);
-    let b: uint64x2_t = simd_shuffle2(b, b, [0, 1]);
+    let b: uint64x2_t = simd_shuffle2!(b, b, [0, 1]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1982,10 +1982,10 @@ pub unsafe fn vcopyq_lane_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t,
 pub unsafe fn vcopyq_lane_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t, b: poly64x1_t) -> poly64x2_t {
    static_assert_imm1!(LANE1);
    static_assert!(LANE2 : i32 where LANE2 == 0);
-    let b: poly64x2_t = simd_shuffle2(b, b, [0, 1]);
+    let b: poly64x2_t = simd_shuffle2!(b, b, [0, 1]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -1998,12 +1998,12 @@ pub unsafe fn vcopyq_lane_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t,
 pub unsafe fn vcopyq_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
    static_assert_imm2!(LANE1);
    static_assert_imm1!(LANE2);
-    let b: float32x4_t = simd_shuffle4(b, b, [0, 1, 2, 3]);
+    let b: float32x4_t = simd_shuffle4!(b, b, [0, 1, 2, 3]);
    match LANE1 & 0b11 {
-        0 => simd_shuffle4(a, b, [4 + LANE2 as u32, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [0, 4 + LANE2 as u32, 2, 3]),
-        2 => simd_shuffle4(a, b, [0, 1, 4 + LANE2 as u32, 3]),
-        3 => simd_shuffle4(a, b, [0, 1, 2, 4 + LANE2 as u32]),
+        0 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [4 + LANE2 as u32, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 4 + LANE2 as u32, 2, 3]),
+        2 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 4 + LANE2 as u32, 3]),
+        3 => simd_shuffle4!(a, b, <const LANE1: i32, const LANE2: i32> [0, 1, 2, 4 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -2016,10 +2016,10 @@ pub unsafe fn vcopyq_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_t
 pub unsafe fn vcopyq_lane_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
    static_assert_imm1!(LANE1);
    static_assert!(LANE2 : i32 where LANE2 == 0);
-    let b: float64x2_t = simd_shuffle2(b, b, [0, 1]);
+    let b: float64x2_t = simd_shuffle2!(b, b, [0, 1]);
    match LANE1 & 0b1 {
-        0 => simd_shuffle2(a, b, [2 + LANE2 as u32, 1]),
-        1 => simd_shuffle2(a, b, [0, 2 + LANE2 as u32]),
+        0 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [2 + LANE2 as u32, 1]),
+        1 => simd_shuffle2!(a, b, <const LANE1: i32, const LANE2: i32> [0, 2 + LANE2 as u32]),
        _ => unreachable_unchecked(),
    }
 }
@@ -2077,7 +2077,7 @@ pub unsafe fn vcvt_f64_f32(a: float32x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtl))]
 pub unsafe fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t {
-    let b: float32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let b: float32x2_t = simd_shuffle2!(a, a, [2, 3]);
    simd_cast(b)
 }

@@ -2094,7 +2094,7 @@ pub unsafe fn vcvt_f32_f64(a: float64x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtn))]
 pub unsafe fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
-    simd_shuffle4(a, simd_cast(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, simd_cast(b), [0, 1, 2, 3])
 }

 /// Floating-point convert to lower precision narrow, rounding to odd
@@ -2115,7 +2115,7 @@ pub unsafe fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtxn))]
 pub unsafe fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t {
-    simd_shuffle4(a, vcvtx_f32_f64(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vcvtx_f32_f64(b), [0, 1, 2, 3])
 }

 /// Fixed-point convert to floating-point
@@ -3085,7 +3085,7 @@ pub unsafe fn vcvtpd_u64_f64(a: f64) -> u64 {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_p64<const N: i32>(a: poly64x2_t) -> poly64x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -3095,7 +3095,7 @@ pub unsafe fn vdupq_laneq_p64<const N: i32>(a: poly64x2_t) -> poly64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_p64<const N: i32>(a: poly64x1_t) -> poly64x2_t {
    static_assert!(N : i32 where N == 0);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -3105,7 +3105,7 @@ pub unsafe fn vdupq_lane_p64<const N: i32>(a: poly64x1_t) -> poly64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_f64<const N: i32>(a: float64x2_t) -> float64x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -3115,7 +3115,7 @@ pub unsafe fn vdupq_laneq_f64<const N: i32>(a: float64x2_t) -> float64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_f64<const N: i32>(a: float64x1_t) -> float64x2_t {
    static_assert!(N : i32 where N == 0);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -3406,8 +3406,8 @@ pub unsafe fn vdupd_laneq_f64<const N: i32>(a: float64x2_t) -> f64 {
 pub unsafe fn vextq_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3420,8 +3420,8 @@ pub unsafe fn vextq_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_
 pub unsafe fn vextq_f64<const N: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3447,8 +3447,8 @@ pub unsafe fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlal2))]
 pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
-    let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmlal_s8(a, b, c)
 }

@@ -3457,8 +3457,8 @@ pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlal2))]
 pub unsafe fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let c: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    vmlal_s16(a, b, c)
 }

@@ -3467,8 +3467,8 @@ pub unsafe fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlal2))]
 pub unsafe fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let c: int32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let c: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
    vmlal_s32(a, b, c)
 }

@@ -3477,8 +3477,8 @@ pub unsafe fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlal2))]
 pub unsafe fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
-    let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmlal_u8(a, b, c)
 }

@@ -3487,8 +3487,8 @@ pub unsafe fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlal2))]
 pub unsafe fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
-    let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let c: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    vmlal_u16(a, b, c)
 }

@@ -3497,8 +3497,8 @@ pub unsafe fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uin
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlal2))]
 pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
-    let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let c: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
    vmlal_u32(a, b, c)
 }

@@ -3541,7 +3541,7 @@ pub unsafe fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3551,7 +3551,7 @@ pub unsafe fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3561,7 +3561,7 @@ pub unsafe fn vmlal_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3571,7 +3571,7 @@ pub unsafe fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3581,7 +3581,7 @@ pub unsafe fn vmlal_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3591,7 +3591,7 @@ pub unsafe fn vmlal_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3601,7 +3601,7 @@ pub unsafe fn vmlal_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-add long
@@ -3611,7 +3611,7 @@ pub unsafe fn vmlal_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply-subtract from accumulator
@@ -3635,8 +3635,8 @@ pub unsafe fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlsl2))]
 pub unsafe fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
-    let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmlsl_s8(a, b, c)
 }

@@ -3645,8 +3645,8 @@ pub unsafe fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlsl2))]
 pub unsafe fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let c: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    vmlsl_s16(a, b, c)
 }

@@ -3655,8 +3655,8 @@ pub unsafe fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smlsl2))]
 pub unsafe fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let c: int32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let c: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
    vmlsl_s32(a, b, c)
 }

@@ -3665,8 +3665,8 @@ pub unsafe fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlsl2))]
 pub unsafe fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
-    let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmlsl_u8(a, b, c)
 }

@@ -3675,8 +3675,8 @@ pub unsafe fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlsl2))]
 pub unsafe fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
-    let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let c: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    vmlsl_u16(a, b, c)
 }

@@ -3685,8 +3685,8 @@ pub unsafe fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uin
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umlsl2))]
 pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
-    let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let c: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
    vmlsl_u32(a, b, c)
 }

@@ -3729,7 +3729,7 @@ pub unsafe fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3739,7 +3739,7 @@ pub unsafe fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3749,7 +3749,7 @@ pub unsafe fn vmlsl_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3759,7 +3759,7 @@ pub unsafe fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3769,7 +3769,7 @@ pub unsafe fn vmlsl_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3779,7 +3779,7 @@ pub unsafe fn vmlsl_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3789,7 +3789,7 @@ pub unsafe fn vmlsl_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract long
@@ -3799,7 +3799,7 @@ pub unsafe fn vmlsl_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t,
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_high_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Extract narrow
@@ -3808,7 +3808,7 @@ pub unsafe fn vmlsl_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    let c: int8x8_t = simd_cast(b);
-    simd_shuffle16(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Extract narrow
@@ -3817,7 +3817,7 @@ pub unsafe fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    let c: int16x4_t = simd_cast(b);
-    simd_shuffle8(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Extract narrow
@@ -3826,7 +3826,7 @@ pub unsafe fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t {
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    let c: int32x2_t = simd_cast(b);
-    simd_shuffle4(a, c, [0, 1, 2, 3])
+    simd_shuffle4!(a, c, [0, 1, 2, 3])
 }

 /// Extract narrow
@@ -3835,7 +3835,7 @@ pub unsafe fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t {
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    let c: uint8x8_t = simd_cast(b);
-    simd_shuffle16(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Extract narrow
@@ -3844,7 +3844,7 @@ pub unsafe fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    let c: uint16x4_t = simd_cast(b);
-    simd_shuffle8(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Extract narrow
@@ -3853,7 +3853,7 @@ pub unsafe fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
 #[cfg_attr(test, assert_instr(xtn2))]
 pub unsafe fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    let c: uint32x2_t = simd_cast(b);
-    simd_shuffle4(a, c, [0, 1, 2, 3])
+    simd_shuffle4!(a, c, [0, 1, 2, 3])
 }

 /// Negate
@@ -4529,7 +4529,7 @@ pub unsafe fn vmul_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
    static_assert!(LANE : i32 where LANE == 0);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -4539,7 +4539,7 @@ pub unsafe fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -4591,8 +4591,8 @@ pub unsafe fn vmuld_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smull2))]
 pub unsafe fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
-    let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmull_s8(a, b)
 }

@@ -4601,8 +4601,8 @@ pub unsafe fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smull2))]
 pub unsafe fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    vmull_s16(a, b)
 }

@@ -4611,8 +4611,8 @@ pub unsafe fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(smull2))]
 pub unsafe fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    vmull_s32(a, b)
 }

@@ -4621,8 +4621,8 @@ pub unsafe fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umull2))]
 pub unsafe fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
-    let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmull_u8(a, b)
 }

@@ -4631,8 +4631,8 @@ pub unsafe fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umull2))]
 pub unsafe fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
-    let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    vmull_u16(a, b)
 }

@@ -4641,8 +4641,8 @@ pub unsafe fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(umull2))]
 pub unsafe fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
-    let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    vmull_u32(a, b)
 }

@@ -4651,8 +4651,8 @@ pub unsafe fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(pmull))]
 pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t {
-    let a: poly8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let b: poly8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: poly8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: poly8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    vmull_p8(a, b)
 }

@@ -4695,7 +4695,7 @@ pub unsafe fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_s16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4705,7 +4705,7 @@ pub unsafe fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_s16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4715,7 +4715,7 @@ pub unsafe fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t)
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_s32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4725,7 +4725,7 @@ pub unsafe fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_s32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4735,7 +4735,7 @@ pub unsafe fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t)
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_u16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4745,7 +4745,7 @@ pub unsafe fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t)
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_u16(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4755,7 +4755,7 @@ pub unsafe fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_u32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply long
@@ -4765,7 +4765,7 @@ pub unsafe fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t)
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_high_u32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4847,7 +4847,7 @@ pub unsafe fn vmulx_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
    static_assert_imm1!(LANE);
-    vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmulx_f32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4857,7 +4857,7 @@ pub unsafe fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
    static_assert_imm2!(LANE);
-    vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmulx_f32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4867,7 +4867,7 @@ pub unsafe fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
    static_assert_imm1!(LANE);
-    vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmulxq_f32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4877,7 +4877,7 @@ pub unsafe fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
    static_assert_imm2!(LANE);
-    vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmulxq_f32(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4887,7 +4887,7 @@ pub unsafe fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t)
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
    static_assert!(LANE : i32 where LANE == 0);
-    vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmulxq_f64(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -4897,7 +4897,7 @@ pub unsafe fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    static_assert_imm1!(LANE);
-    vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmulxq_f64(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply extended
@@ -5167,7 +5167,7 @@ pub unsafe fn vaddlvq_u32(a: uint32x4_t) -> u64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubw))]
 pub unsafe fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
-    let c: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    simd_sub(a, simd_cast(c))
 }

@@ -5176,7 +5176,7 @@ pub unsafe fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubw))]
 pub unsafe fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
-    let c: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let c: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    simd_sub(a, simd_cast(c))
 }

@@ -5185,7 +5185,7 @@ pub unsafe fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubw))]
 pub unsafe fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
-    let c: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let c: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    simd_sub(a, simd_cast(c))
 }

@@ -5194,7 +5194,7 @@ pub unsafe fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubw))]
 pub unsafe fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
-    let c: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    simd_sub(a, simd_cast(c))
 }

@@ -5203,7 +5203,7 @@ pub unsafe fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubw))]
 pub unsafe fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
-    let c: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let c: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    simd_sub(a, simd_cast(c))
 }

@@ -5212,7 +5212,7 @@ pub unsafe fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubw))]
 pub unsafe fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
-    let c: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let c: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    simd_sub(a, simd_cast(c))
 }

@@ -5221,9 +5221,9 @@ pub unsafe fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubl))]
 pub unsafe fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
-    let c: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
    let d: int16x8_t = simd_cast(c);
-    let e: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let e: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let f: int16x8_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5233,9 +5233,9 @@ pub unsafe fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubl))]
 pub unsafe fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
-    let c: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let c: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
    let d: int32x4_t = simd_cast(c);
-    let e: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let e: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let f: int32x4_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5245,9 +5245,9 @@ pub unsafe fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(ssubl))]
 pub unsafe fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
-    let c: int32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let c: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
    let d: int64x2_t = simd_cast(c);
-    let e: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let e: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let f: int64x2_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5257,9 +5257,9 @@ pub unsafe fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubl))]
 pub unsafe fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
-    let c: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let c: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
    let d: uint16x8_t = simd_cast(c);
-    let e: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let e: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let f: uint16x8_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5269,9 +5269,9 @@ pub unsafe fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubl))]
 pub unsafe fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
-    let c: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let c: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
    let d: uint32x4_t = simd_cast(c);
-    let e: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let e: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let f: uint32x4_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5281,9 +5281,9 @@ pub unsafe fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(usubl))]
 pub unsafe fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
-    let c: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let c: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
    let d: uint64x2_t = simd_cast(c);
-    let e: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let e: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let f: uint64x2_t = simd_cast(e);
    simd_sub(d, f)
 }
@@ -5498,8 +5498,8 @@ pub unsafe fn vqdmulls_s32(a: i32, b: i32) -> i64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqdmull2))]
 pub unsafe fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    vqdmull_s16(a, b)
 }

@@ -5508,8 +5508,8 @@ pub unsafe fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqdmull2))]
 pub unsafe fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    vqdmull_s32(a, b)
 }

@@ -5518,7 +5518,7 @@ pub unsafe fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqdmull2))]
 pub unsafe fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
    let b: int16x4_t = vdup_n_s16(b);
    vqdmull_s16(a, b)
 }
@@ -5528,7 +5528,7 @@ pub unsafe fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqdmull2))]
 pub unsafe fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
    let b: int32x2_t = vdup_n_s32(b);
    vqdmull_s32(a, b)
 }
@@ -5540,7 +5540,7 @@ pub unsafe fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_laneq_s16<const N: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
    static_assert_imm3!(N);
-    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
    vqdmull_s16(a, b)
 }

@@ -5551,7 +5551,7 @@ pub unsafe fn vqdmull_laneq_s16<const N: i32>(a: int16x4_t, b: int16x8_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_laneq_s32<const N: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
    static_assert_imm2!(N);
-    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
    vqdmull_s32(a, b)
 }

@@ -5606,8 +5606,8 @@ pub unsafe fn vqdmulls_laneq_s32<const N: i32>(a: i32, b: int32x4_t) -> i64 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_high_lane_s16<const N: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
    static_assert_imm2!(N);
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
    vqdmull_s16(a, b)
 }

@@ -5618,8 +5618,8 @@ pub unsafe fn vqdmull_high_lane_s16<const N: i32>(a: int16x8_t, b: int16x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_high_lane_s32<const N: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
    static_assert_imm1!(N);
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
    vqdmull_s32(a, b)
 }

@@ -5630,8 +5630,8 @@ pub unsafe fn vqdmull_high_lane_s32<const N: i32>(a: int32x4_t, b: int32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_high_laneq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
    static_assert_imm3!(N);
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
    vqdmull_s16(a, b)
 }

@@ -5642,8 +5642,8 @@ pub unsafe fn vqdmull_high_laneq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_high_laneq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
    static_assert_imm2!(N);
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
    vqdmull_s32(a, b)
 }

@@ -6183,7 +6183,7 @@ pub unsafe fn vqrshrnd_n_s64<const N: i32>(a: i64) -> i32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Signed saturating rounded shift right narrow
@@ -6193,7 +6193,7 @@ pub unsafe fn vqrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Signed saturating rounded shift right narrow
@@ -6203,7 +6203,7 @@ pub unsafe fn vqrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> in
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqrshrn_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqrshrn_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Unsigned saturating rounded shift right narrow
@@ -6246,7 +6246,7 @@ pub unsafe fn vqrshrnd_n_u64<const N: i32>(a: u64) -> u32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Unsigned saturating rounded shift right narrow
@@ -6256,7 +6256,7 @@ pub unsafe fn vqrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Unsigned saturating rounded shift right narrow
@@ -6266,7 +6266,7 @@ pub unsafe fn vqrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqrshrn_n_u64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqrshrn_n_u64::<N>(b), [0, 1, 2, 3])
 }

 /// Signed saturating rounded shift right unsigned narrow
@@ -6309,7 +6309,7 @@ pub unsafe fn vqrshrund_n_s64<const N: i32>(a: i64) -> u32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqrshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqrshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Signed saturating rounded shift right unsigned narrow
@@ -6319,7 +6319,7 @@ pub unsafe fn vqrshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqrshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqrshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Signed saturating rounded shift right unsigned narrow
@@ -6329,7 +6329,7 @@ pub unsafe fn vqrshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrshrun_high_n_s64<const N: i32>(a: uint32x2_t, b: int64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqrshrun_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqrshrun_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Signed saturating shift left
@@ -6521,7 +6521,7 @@ pub unsafe fn vqshrnd_n_s64<const N: i32>(a: i64) -> i32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Signed saturating shift right narrow
@@ -6531,7 +6531,7 @@ pub unsafe fn vqshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Signed saturating shift right narrow
@@ -6541,7 +6541,7 @@ pub unsafe fn vqshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqshrn_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqshrn_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Unsigned saturating shift right narrow
@@ -6581,7 +6581,7 @@ pub unsafe fn vqshrnd_n_u64<const N: i32>(a: u64) -> u32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Unsigned saturating shift right narrow
@@ -6591,7 +6591,7 @@ pub unsafe fn vqshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Unsigned saturating shift right narrow
@@ -6601,7 +6601,7 @@ pub unsafe fn vqshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqshrn_n_u64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqshrn_n_u64::<N>(b), [0, 1, 2, 3])
 }

 /// Signed saturating shift right unsigned narrow
@@ -6641,7 +6641,7 @@ pub unsafe fn vqshrund_n_s64<const N: i32>(a: i64) -> u32 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vqshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vqshrun_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Signed saturating shift right unsigned narrow
@@ -6651,7 +6651,7 @@ pub unsafe fn vqshrun_high_n_s16<const N: i32>(a: uint8x8_t, b: int16x8_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vqshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vqshrun_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Signed saturating shift right unsigned narrow
@@ -6661,7 +6661,7 @@ pub unsafe fn vqshrun_high_n_s32<const N: i32>(a: uint16x4_t, b: int32x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqshrun_high_n_s64<const N: i32>(a: uint32x2_t, b: int64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vqshrun_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vqshrun_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Calculates the square root of each lane.
@@ -7527,7 +7527,7 @@ pub unsafe fn vrshrd_n_u64<const N: i32>(a: u64) -> u64 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vrshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Rounding shift right narrow
@@ -7537,7 +7537,7 @@ pub unsafe fn vrshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vrshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Rounding shift right narrow
@@ -7547,7 +7547,7 @@ pub unsafe fn vrshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vrshrn_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vrshrn_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Rounding shift right narrow
@@ -7557,7 +7557,7 @@ pub unsafe fn vrshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vrshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Rounding shift right narrow
@@ -7567,7 +7567,7 @@ pub unsafe fn vrshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vrshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Rounding shift right narrow
@@ -7577,7 +7577,7 @@ pub unsafe fn vrshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vrshrn_n_u64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vrshrn_n_u64::<N>(b), [0, 1, 2, 3])
 }

 /// Signed rounding shift right and accumulate.
@@ -7645,7 +7645,7 @@ pub unsafe fn vshld_u64(a: u64, b: i64) -> u64 {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_s8<const N: i32>(a: int8x16_t) -> int16x8_t {
    static_assert!(N : i32 where N >= 0 && N <= 8);
-    let b: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
    vshll_n_s8::<N>(b)
 }

@@ -7656,7 +7656,7 @@ pub unsafe fn vshll_high_n_s8<const N: i32>(a: int8x16_t) -> int16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_s16<const N: i32>(a: int16x8_t) -> int32x4_t {
    static_assert!(N : i32 where N >= 0 && N <= 16);
-    let b: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
    vshll_n_s16::<N>(b)
 }

@@ -7667,7 +7667,7 @@ pub unsafe fn vshll_high_n_s16<const N: i32>(a: int16x8_t) -> int32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_s32<const N: i32>(a: int32x4_t) -> int64x2_t {
    static_assert!(N : i32 where N >= 0 && N <= 32);
-    let b: int32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
    vshll_n_s32::<N>(b)
 }

@@ -7678,7 +7678,7 @@ pub unsafe fn vshll_high_n_s32<const N: i32>(a: int32x4_t) -> int64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_u8<const N: i32>(a: uint8x16_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 0 && N <= 8);
-    let b: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
    vshll_n_u8::<N>(b)
 }

@@ -7689,7 +7689,7 @@ pub unsafe fn vshll_high_n_u8<const N: i32>(a: uint8x16_t) -> uint16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_u16<const N: i32>(a: uint16x8_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 0 && N <= 16);
-    let b: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
    vshll_n_u16::<N>(b)
 }

@@ -7700,7 +7700,7 @@ pub unsafe fn vshll_high_n_u16<const N: i32>(a: uint16x8_t) -> uint32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vshll_high_n_u32<const N: i32>(a: uint32x4_t) -> uint64x2_t {
    static_assert!(N : i32 where N >= 0 && N <= 32);
-    let b: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
    vshll_n_u32::<N>(b)
 }

@@ -7711,7 +7711,7 @@ pub unsafe fn vshll_high_n_u32<const N: i32>(a: uint32x4_t) -> uint64x2_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vshrn_n_s16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Shift right narrow
@@ -7721,7 +7721,7 @@ pub unsafe fn vshrn_high_n_s16<const N: i32>(a: int8x8_t, b: int16x8_t) -> int8x
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vshrn_n_s32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Shift right narrow
@@ -7731,7 +7731,7 @@ pub unsafe fn vshrn_high_n_s32<const N: i32>(a: int16x4_t, b: int32x4_t) -> int1
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vshrn_n_s64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vshrn_n_s64::<N>(b), [0, 1, 2, 3])
 }

 /// Shift right narrow
@@ -7741,7 +7741,7 @@ pub unsafe fn vshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int3
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shuffle16(a, vshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, vshrn_n_u16::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Shift right narrow
@@ -7751,7 +7751,7 @@ pub unsafe fn vshrn_high_n_u16<const N: i32>(a: uint8x8_t, b: uint16x8_t) -> uin
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shuffle8(a, vshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, vshrn_n_u32::<N>(b), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Shift right narrow
@@ -7761,7 +7761,7 @@ pub unsafe fn vshrn_high_n_u32<const N: i32>(a: uint16x4_t, b: uint32x4_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shuffle4(a, vshrn_n_u64::<N>(b), [0, 1, 2, 3])
+    simd_shuffle4!(a, vshrn_n_u64::<N>(b), [0, 1, 2, 3])
 }

 /// Transpose vectors
@@ -7769,7 +7769,7 @@ pub unsafe fn vshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> ui
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7777,7 +7777,7 @@ pub unsafe fn vtrn1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+    simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
 }

 /// Transpose vectors
@@ -7785,7 +7785,7 @@ pub unsafe fn vtrn1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7793,7 +7793,7 @@ pub unsafe fn vtrn1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7801,7 +7801,7 @@ pub unsafe fn vtrn1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7809,7 +7809,7 @@ pub unsafe fn vtrn1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7817,7 +7817,7 @@ pub unsafe fn vtrn1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+    simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
 }

 /// Transpose vectors
@@ -7825,7 +7825,7 @@ pub unsafe fn vtrn1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7833,7 +7833,7 @@ pub unsafe fn vtrn1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7841,7 +7841,7 @@ pub unsafe fn vtrn1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7849,7 +7849,7 @@ pub unsafe fn vtrn1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7857,7 +7857,7 @@ pub unsafe fn vtrn1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
+    simd_shuffle16!(a, b, [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30])
 }

 /// Transpose vectors
@@ -7865,7 +7865,7 @@ pub unsafe fn vtrn1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7873,7 +7873,7 @@ pub unsafe fn vtrn1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
+    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
 }

 /// Transpose vectors
@@ -7881,7 +7881,7 @@ pub unsafe fn vtrn1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7889,7 +7889,7 @@ pub unsafe fn vtrn1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7897,7 +7897,7 @@ pub unsafe fn vtrn1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7905,7 +7905,7 @@ pub unsafe fn vtrn1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7913,7 +7913,7 @@ pub unsafe fn vtrn1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7921,7 +7921,7 @@ pub unsafe fn vtrn1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn1))]
 pub unsafe fn vtrn1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Transpose vectors
@@ -7929,7 +7929,7 @@ pub unsafe fn vtrn1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7937,7 +7937,7 @@ pub unsafe fn vtrn1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vtrn1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Transpose vectors
@@ -7945,7 +7945,7 @@ pub unsafe fn vtrn1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -7953,7 +7953,7 @@ pub unsafe fn vtrn2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+    simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
 }

 /// Transpose vectors
@@ -7961,7 +7961,7 @@ pub unsafe fn vtrn2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -7969,7 +7969,7 @@ pub unsafe fn vtrn2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -7977,7 +7977,7 @@ pub unsafe fn vtrn2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -7985,7 +7985,7 @@ pub unsafe fn vtrn2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -7993,7 +7993,7 @@ pub unsafe fn vtrn2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+    simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
 }

 /// Transpose vectors
@@ -8001,7 +8001,7 @@ pub unsafe fn vtrn2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -8009,7 +8009,7 @@ pub unsafe fn vtrn2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -8017,7 +8017,7 @@ pub unsafe fn vtrn2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -8025,7 +8025,7 @@ pub unsafe fn vtrn2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -8033,7 +8033,7 @@ pub unsafe fn vtrn2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
+    simd_shuffle16!(a, b, [1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31])
 }

 /// Transpose vectors
@@ -8041,7 +8041,7 @@ pub unsafe fn vtrn2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -8049,7 +8049,7 @@ pub unsafe fn vtrn2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
+    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
 }

 /// Transpose vectors
@@ -8057,7 +8057,7 @@ pub unsafe fn vtrn2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8065,7 +8065,7 @@ pub unsafe fn vtrn2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8073,7 +8073,7 @@ pub unsafe fn vtrn2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8081,7 +8081,7 @@ pub unsafe fn vtrn2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8089,7 +8089,7 @@ pub unsafe fn vtrn2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8097,7 +8097,7 @@ pub unsafe fn vtrn2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(trn2))]
 pub unsafe fn vtrn2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Transpose vectors
@@ -8105,7 +8105,7 @@ pub unsafe fn vtrn2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Transpose vectors
@@ -8113,7 +8113,7 @@ pub unsafe fn vtrn2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vtrn2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8121,7 +8121,7 @@ pub unsafe fn vtrn2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8129,7 +8129,7 @@ pub unsafe fn vzip1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+    simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
 }

 /// Zip vectors
@@ -8137,7 +8137,7 @@ pub unsafe fn vzip1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8145,7 +8145,7 @@ pub unsafe fn vzip1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8153,7 +8153,7 @@ pub unsafe fn vzip1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8161,7 +8161,7 @@ pub unsafe fn vzip1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8169,7 +8169,7 @@ pub unsafe fn vzip1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8177,7 +8177,7 @@ pub unsafe fn vzip1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8185,7 +8185,7 @@ pub unsafe fn vzip1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+    simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
 }

 /// Zip vectors
@@ -8193,7 +8193,7 @@ pub unsafe fn vzip1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8201,7 +8201,7 @@ pub unsafe fn vzip1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8209,7 +8209,7 @@ pub unsafe fn vzip1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8217,7 +8217,7 @@ pub unsafe fn vzip1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8225,7 +8225,7 @@ pub unsafe fn vzip1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8233,7 +8233,7 @@ pub unsafe fn vzip1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8241,7 +8241,7 @@ pub unsafe fn vzip1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
+    simd_shuffle16!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23])
 }

 /// Zip vectors
@@ -8249,7 +8249,7 @@ pub unsafe fn vzip1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8257,7 +8257,7 @@ pub unsafe fn vzip1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 2, 10, 3, 11])
 }

 /// Zip vectors
@@ -8265,7 +8265,7 @@ pub unsafe fn vzip1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8273,7 +8273,7 @@ pub unsafe fn vzip1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8281,7 +8281,7 @@ pub unsafe fn vzip1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Zip vectors
@@ -8289,7 +8289,7 @@ pub unsafe fn vzip1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vzip1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Zip vectors
@@ -8297,7 +8297,7 @@ pub unsafe fn vzip1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8305,7 +8305,7 @@ pub unsafe fn vzip2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+    simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
 }

 /// Zip vectors
@@ -8313,7 +8313,7 @@ pub unsafe fn vzip2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8321,7 +8321,7 @@ pub unsafe fn vzip2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8329,7 +8329,7 @@ pub unsafe fn vzip2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8337,7 +8337,7 @@ pub unsafe fn vzip2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8345,7 +8345,7 @@ pub unsafe fn vzip2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8353,7 +8353,7 @@ pub unsafe fn vzip2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8361,7 +8361,7 @@ pub unsafe fn vzip2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+    simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
 }

 /// Zip vectors
@@ -8369,7 +8369,7 @@ pub unsafe fn vzip2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8377,7 +8377,7 @@ pub unsafe fn vzip2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8385,7 +8385,7 @@ pub unsafe fn vzip2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8393,7 +8393,7 @@ pub unsafe fn vzip2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8401,7 +8401,7 @@ pub unsafe fn vzip2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8409,7 +8409,7 @@ pub unsafe fn vzip2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8417,7 +8417,7 @@ pub unsafe fn vzip2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
+    simd_shuffle16!(a, b, [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31])
 }

 /// Zip vectors
@@ -8425,7 +8425,7 @@ pub unsafe fn vzip2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8433,7 +8433,7 @@ pub unsafe fn vzip2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [4, 12, 5, 13, 6, 14, 7, 15])
 }

 /// Zip vectors
@@ -8441,7 +8441,7 @@ pub unsafe fn vzip2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8449,7 +8449,7 @@ pub unsafe fn vzip2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Zip vectors
@@ -8457,7 +8457,7 @@ pub unsafe fn vzip2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Zip vectors
@@ -8465,7 +8465,7 @@ pub unsafe fn vzip2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vzip2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8473,7 +8473,7 @@ pub unsafe fn vzip2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8481,7 +8481,7 @@ pub unsafe fn vuzp1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+    simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
 }

 /// Unzip vectors
@@ -8489,7 +8489,7 @@ pub unsafe fn vuzp1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8497,7 +8497,7 @@ pub unsafe fn vuzp1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8505,7 +8505,7 @@ pub unsafe fn vuzp1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8513,7 +8513,7 @@ pub unsafe fn vuzp1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8521,7 +8521,7 @@ pub unsafe fn vuzp1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+    simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
 }

 /// Unzip vectors
@@ -8529,7 +8529,7 @@ pub unsafe fn vuzp1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8537,7 +8537,7 @@ pub unsafe fn vuzp1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8545,7 +8545,7 @@ pub unsafe fn vuzp1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8553,7 +8553,7 @@ pub unsafe fn vuzp1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8561,7 +8561,7 @@ pub unsafe fn vuzp1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
+    simd_shuffle16!(a, b, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
 }

 /// Unzip vectors
@@ -8569,7 +8569,7 @@ pub unsafe fn vuzp1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8577,7 +8577,7 @@ pub unsafe fn vuzp1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
+    simd_shuffle8!(a, b, [0, 2, 4, 6, 8, 10, 12, 14])
 }

 /// Unzip vectors
@@ -8585,7 +8585,7 @@ pub unsafe fn vuzp1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8593,7 +8593,7 @@ pub unsafe fn vuzp1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8601,7 +8601,7 @@ pub unsafe fn vuzp1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8609,7 +8609,7 @@ pub unsafe fn vuzp1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8617,7 +8617,7 @@ pub unsafe fn vuzp1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8625,7 +8625,7 @@ pub unsafe fn vuzp1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp1))]
 pub unsafe fn vuzp1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [0, 2, 4, 6])
+    simd_shuffle4!(a, b, [0, 2, 4, 6])
 }

 /// Unzip vectors
@@ -8633,7 +8633,7 @@ pub unsafe fn vuzp1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8641,7 +8641,7 @@ pub unsafe fn vuzp1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip1))]
 pub unsafe fn vuzp1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 /// Unzip vectors
@@ -8649,7 +8649,7 @@ pub unsafe fn vuzp1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8657,7 +8657,7 @@ pub unsafe fn vuzp2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+    simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
 }

 /// Unzip vectors
@@ -8665,7 +8665,7 @@ pub unsafe fn vuzp2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8673,7 +8673,7 @@ pub unsafe fn vuzp2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8681,7 +8681,7 @@ pub unsafe fn vuzp2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8689,7 +8689,7 @@ pub unsafe fn vuzp2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8697,7 +8697,7 @@ pub unsafe fn vuzp2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+    simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
 }

 /// Unzip vectors
@@ -8705,7 +8705,7 @@ pub unsafe fn vuzp2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8713,7 +8713,7 @@ pub unsafe fn vuzp2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8721,7 +8721,7 @@ pub unsafe fn vuzp2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8729,7 +8729,7 @@ pub unsafe fn vuzp2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8737,7 +8737,7 @@ pub unsafe fn vuzp2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
+    simd_shuffle16!(a, b, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
 }

 /// Unzip vectors
@@ -8745,7 +8745,7 @@ pub unsafe fn vuzp2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8753,7 +8753,7 @@ pub unsafe fn vuzp2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
+    simd_shuffle8!(a, b, [1, 3, 5, 7, 9, 11, 13, 15])
 }

 /// Unzip vectors
@@ -8761,7 +8761,7 @@ pub unsafe fn vuzp2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8769,7 +8769,7 @@ pub unsafe fn vuzp2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8777,7 +8777,7 @@ pub unsafe fn vuzp2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8785,7 +8785,7 @@ pub unsafe fn vuzp2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8793,7 +8793,7 @@ pub unsafe fn vuzp2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8801,7 +8801,7 @@ pub unsafe fn vuzp2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uzp2))]
 pub unsafe fn vuzp2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, b, [1, 3, 5, 7])
+    simd_shuffle4!(a, b, [1, 3, 5, 7])
 }

 /// Unzip vectors
@@ -8809,7 +8809,7 @@ pub unsafe fn vuzp2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unzip vectors
@@ -8817,7 +8817,7 @@ pub unsafe fn vuzp2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(zip2))]
 pub unsafe fn vuzp2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// Unsigned Absolute difference and Accumulate Long
@@ -8825,8 +8825,8 @@ pub unsafe fn vuzp2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabal))]
 pub unsafe fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
-    let d: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let e: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let d: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let e: uint8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    let f: uint8x8_t = vabd_u8(d, e);
    simd_add(a, simd_cast(f))
 }
@@ -8836,8 +8836,8 @@ pub unsafe fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabal))]
 pub unsafe fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
-    let d: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let e: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let d: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let e: uint16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    let f: uint16x4_t = vabd_u16(d, e);
    simd_add(a, simd_cast(f))
 }
@@ -8847,8 +8847,8 @@ pub unsafe fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uin
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uabal))]
 pub unsafe fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
-    let d: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let e: uint32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let d: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let e: uint32x2_t = simd_shuffle2!(c, c, [2, 3]);
    let f: uint32x2_t = vabd_u32(d, e);
    simd_add(a, simd_cast(f))
 }
@@ -8858,8 +8858,8 @@ pub unsafe fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabal))]
 pub unsafe fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
-    let d: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let e: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let d: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let e: int8x8_t = simd_shuffle8!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
    let f: int8x8_t = vabd_s8(d, e);
    let f: uint8x8_t = simd_cast(f);
    simd_add(a, simd_cast(f))
@@ -8870,8 +8870,8 @@ pub unsafe fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabal))]
 pub unsafe fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
-    let d: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
-    let e: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
+    let d: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
+    let e: int16x4_t = simd_shuffle4!(c, c, [4, 5, 6, 7]);
    let f: int16x4_t = vabd_s16(d, e);
    let f: uint16x4_t = simd_cast(f);
    simd_add(a, simd_cast(f))
@@ -8882,8 +8882,8 @@ pub unsafe fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sabal))]
 pub unsafe fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
-    let d: int32x2_t = simd_shuffle2(b, b, [2, 3]);
-    let e: int32x2_t = simd_shuffle2(c, c, [2, 3]);
+    let d: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
+    let e: int32x2_t = simd_shuffle2!(c, c, [2, 3]);
    let f: int32x2_t = vabd_s32(d, e);
    let f: uint32x2_t = simd_cast(f);
    simd_add(a, simd_cast(f))
@@ -1595,7 +1595,7 @@ pub unsafe fn vext_f64<const N: i32>(a: float64x1_t, _b: float64x1_t) -> float64
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
-    simd_shuffle16(
+    simd_shuffle16!(
        low,
        high,
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@@ -1607,7 +1607,7 @@ pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
-    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Vector combine
@@ -1615,7 +1615,7 @@ pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
-    simd_shuffle4(low, high, [0, 1, 2, 3])
+    simd_shuffle4!(low, high, [0, 1, 2, 3])
 }

 /// Vector combine
@@ -1623,7 +1623,7 @@ pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
-    simd_shuffle2(low, high, [0, 1])
+    simd_shuffle2!(low, high, [0, 1])
 }

 /// Vector combine
@@ -1631,7 +1631,7 @@ pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
-    simd_shuffle16(
+    simd_shuffle16!(
        low,
        high,
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@@ -1643,7 +1643,7 @@ pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
-    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Vector combine
@@ -1651,7 +1651,7 @@ pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
-    simd_shuffle4(low, high, [0, 1, 2, 3])
+    simd_shuffle4!(low, high, [0, 1, 2, 3])
 }

 /// Vector combine
@@ -1659,7 +1659,7 @@ pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
-    simd_shuffle2(low, high, [0, 1])
+    simd_shuffle2!(low, high, [0, 1])
 }

 /// Vector combine
@@ -1667,7 +1667,7 @@ pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t {
-    simd_shuffle2(low, high, [0, 1])
+    simd_shuffle2!(low, high, [0, 1])
 }

 /// Duplicate vector element to vector or scalar
@@ -1772,7 +1772,7 @@ pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_f16 ( low: float16x4_t,  high: float16x4_t) -> float16x8_t {
-    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
 }
 */

@@ -1781,7 +1781,7 @@ pub unsafe fn vcombine_f16 ( low: float16x4_t,  high: float16x4_t) -> float16x8_
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
-    simd_shuffle4(low, high, [0, 1, 2, 3])
+    simd_shuffle4!(low, high, [0, 1, 2, 3])
 }

 /// Vector combine
@@ -1789,7 +1789,7 @@ pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
-    simd_shuffle16(
+    simd_shuffle16!(
        low,
        high,
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@@ -1801,7 +1801,7 @@ pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
-    simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Vector combine
@@ -1809,7 +1809,7 @@ pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(mov))]
 pub unsafe fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t {
-    simd_shuffle2(low, high, [0, 1])
+    simd_shuffle2!(low, high, [0, 1])
 }

 /// Table look-up
@@ -2449,7 +2449,7 @@ pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2461,7 +2461,7 @@ pub unsafe fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
    static_assert_imm4!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2473,7 +2473,7 @@ pub unsafe fn vdupq_laneq_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2485,7 +2485,7 @@ pub unsafe fn vdup_lane_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2497,7 +2497,7 @@ pub unsafe fn vdupq_laneq_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2509,7 +2509,7 @@ pub unsafe fn vdup_lane_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2521,7 +2521,7 @@ pub unsafe fn vdupq_laneq_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_s8<const N: i32>(a: int8x16_t) -> int8x8_t {
    static_assert_imm4!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2533,7 +2533,7 @@ pub unsafe fn vdup_laneq_s8<const N: i32>(a: int8x16_t) -> int8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_s16<const N: i32>(a: int16x8_t) -> int16x4_t {
    static_assert_imm3!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2545,7 +2545,7 @@ pub unsafe fn vdup_laneq_s16<const N: i32>(a: int16x8_t) -> int16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_s32<const N: i32>(a: int32x4_t) -> int32x2_t {
    static_assert_imm2!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2557,7 +2557,7 @@ pub unsafe fn vdup_laneq_s32<const N: i32>(a: int32x4_t) -> int32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_s8<const N: i32>(a: int8x8_t) -> int8x16_t {
    static_assert_imm3!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2569,7 +2569,7 @@ pub unsafe fn vdupq_lane_s8<const N: i32>(a: int8x8_t) -> int8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_s16<const N: i32>(a: int16x4_t) -> int16x8_t {
    static_assert_imm2!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2581,7 +2581,7 @@ pub unsafe fn vdupq_lane_s16<const N: i32>(a: int16x4_t) -> int16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_s32<const N: i32>(a: int32x2_t) -> int32x4_t {
    static_assert_imm1!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2593,7 +2593,7 @@ pub unsafe fn vdupq_lane_s32<const N: i32>(a: int32x2_t) -> int32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2605,7 +2605,7 @@ pub unsafe fn vdup_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
    static_assert_imm4!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2617,7 +2617,7 @@ pub unsafe fn vdupq_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2629,7 +2629,7 @@ pub unsafe fn vdup_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2641,7 +2641,7 @@ pub unsafe fn vdupq_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2653,7 +2653,7 @@ pub unsafe fn vdup_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2665,7 +2665,7 @@ pub unsafe fn vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x8_t {
    static_assert_imm4!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2677,7 +2677,7 @@ pub unsafe fn vdup_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x4_t {
    static_assert_imm3!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2689,7 +2689,7 @@ pub unsafe fn vdup_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x2_t {
    static_assert_imm2!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2701,7 +2701,7 @@ pub unsafe fn vdup_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x16_t {
    static_assert_imm3!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2713,7 +2713,7 @@ pub unsafe fn vdupq_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x8_t {
    static_assert_imm2!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2725,7 +2725,7 @@ pub unsafe fn vdupq_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x4_t {
    static_assert_imm1!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2737,7 +2737,7 @@ pub unsafe fn vdupq_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2749,7 +2749,7 @@ pub unsafe fn vdup_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x16_t {
    static_assert_imm4!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2761,7 +2761,7 @@ pub unsafe fn vdupq_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2773,7 +2773,7 @@ pub unsafe fn vdup_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x8_t {
    static_assert_imm3!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2785,7 +2785,7 @@ pub unsafe fn vdupq_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x8_t {
    static_assert_imm4!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2797,7 +2797,7 @@ pub unsafe fn vdup_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x4_t {
    static_assert_imm3!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2809,7 +2809,7 @@ pub unsafe fn vdup_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x16_t {
    static_assert_imm3!(N);
-    simd_shuffle16(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle16!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2821,7 +2821,7 @@ pub unsafe fn vdupq_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x16_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x8_t {
    static_assert_imm2!(N);
-    simd_shuffle8(a, a, [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle8!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2833,7 +2833,7 @@ pub unsafe fn vdupq_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x8_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2845,7 +2845,7 @@ pub unsafe fn vdupq_laneq_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_s64<const N: i32>(a: int64x1_t) -> int64x2_t {
    static_assert!(N : i32 where N == 0);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2857,7 +2857,7 @@ pub unsafe fn vdupq_lane_s64<const N: i32>(a: int64x1_t) -> int64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2869,7 +2869,7 @@ pub unsafe fn vdupq_laneq_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_u64<const N: i32>(a: uint64x1_t) -> uint64x2_t {
    static_assert!(N : i32 where N == 0);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2881,7 +2881,7 @@ pub unsafe fn vdupq_lane_u64<const N: i32>(a: uint64x1_t) -> uint64x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_lane_f32<const N: i32>(a: float32x2_t) -> float32x2_t {
    static_assert_imm1!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2893,7 +2893,7 @@ pub unsafe fn vdup_lane_f32<const N: i32>(a: float32x2_t) -> float32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_laneq_f32<const N: i32>(a: float32x4_t) -> float32x4_t {
    static_assert_imm2!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2905,7 +2905,7 @@ pub unsafe fn vdupq_laneq_f32<const N: i32>(a: float32x4_t) -> float32x4_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdup_laneq_f32<const N: i32>(a: float32x4_t) -> float32x2_t {
    static_assert_imm2!(N);
-    simd_shuffle2(a, a, [N as u32, N as u32])
+    simd_shuffle2!(a, a, <const N: i32> [N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2917,7 +2917,7 @@ pub unsafe fn vdup_laneq_f32<const N: i32>(a: float32x4_t) -> float32x2_t {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn vdupq_lane_f32<const N: i32>(a: float32x2_t) -> float32x4_t {
    static_assert_imm1!(N);
-    simd_shuffle4(a, a, [N as u32, N as u32, N as u32, N as u32])
+    simd_shuffle4!(a, a, <const N: i32> [N as u32, N as u32, N as u32, N as u32])
 }

 /// Set all vector lanes to the same value
@@ -2978,14 +2978,14 @@ pub unsafe fn vdup_laneq_u64<const N: i32>(a: uint64x2_t) -> uint64x1_t {
 pub unsafe fn vext_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3000,22 +3000,22 @@ pub unsafe fn vext_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
 pub unsafe fn vextq_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
    static_assert_imm4!(N);
    match N & 0b1111 {
-        0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
-        2 => simd_shuffle16(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
-        3 => simd_shuffle16(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
-        4 => simd_shuffle16(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
-        5 => simd_shuffle16(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
-        6 => simd_shuffle16(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
-        7 => simd_shuffle16(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
-        8 => simd_shuffle16(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
-        9 => simd_shuffle16(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
-        10 => simd_shuffle16(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
-        11 => simd_shuffle16(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
-        12 => simd_shuffle16(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
-        13 => simd_shuffle16(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
-        14 => simd_shuffle16(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
-        15 => simd_shuffle16(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+        0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+        2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+        3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+        4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+        5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+        6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+        7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+        8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+        9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+        10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+        11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+        12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+        13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+        14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+        15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3030,10 +3030,10 @@ pub unsafe fn vextq_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 pub unsafe fn vext_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3048,14 +3048,14 @@ pub unsafe fn vext_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
 pub unsafe fn vextq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3070,8 +3070,8 @@ pub unsafe fn vextq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 pub unsafe fn vext_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3086,10 +3086,10 @@ pub unsafe fn vext_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
 pub unsafe fn vextq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3104,14 +3104,14 @@ pub unsafe fn vextq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 pub unsafe fn vext_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3126,22 +3126,22 @@ pub unsafe fn vext_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 pub unsafe fn vextq_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
    static_assert_imm4!(N);
    match N & 0b1111 {
-        0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
-        2 => simd_shuffle16(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
-        3 => simd_shuffle16(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
-        4 => simd_shuffle16(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
-        5 => simd_shuffle16(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
-        6 => simd_shuffle16(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
-        7 => simd_shuffle16(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
-        8 => simd_shuffle16(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
-        9 => simd_shuffle16(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
-        10 => simd_shuffle16(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
-        11 => simd_shuffle16(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
-        12 => simd_shuffle16(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
-        13 => simd_shuffle16(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
-        14 => simd_shuffle16(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
-        15 => simd_shuffle16(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+        0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+        2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+        3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+        4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+        5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+        6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+        7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+        8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+        9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+        10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+        11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+        12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+        13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+        14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+        15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3156,10 +3156,10 @@ pub unsafe fn vextq_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t
 pub unsafe fn vext_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3174,14 +3174,14 @@ pub unsafe fn vext_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t
 pub unsafe fn vextq_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3196,8 +3196,8 @@ pub unsafe fn vextq_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_
 pub unsafe fn vext_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3212,10 +3212,10 @@ pub unsafe fn vext_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t
 pub unsafe fn vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3230,14 +3230,14 @@ pub unsafe fn vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_
 pub unsafe fn vext_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3252,22 +3252,22 @@ pub unsafe fn vext_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
 pub unsafe fn vextq_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
    static_assert_imm4!(N);
    match N & 0b1111 {
-        0 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle16(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
-        2 => simd_shuffle16(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
-        3 => simd_shuffle16(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
-        4 => simd_shuffle16(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
-        5 => simd_shuffle16(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
-        6 => simd_shuffle16(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
-        7 => simd_shuffle16(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
-        8 => simd_shuffle16(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
-        9 => simd_shuffle16(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
-        10 => simd_shuffle16(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
-        11 => simd_shuffle16(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
-        12 => simd_shuffle16(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
-        13 => simd_shuffle16(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
-        14 => simd_shuffle16(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
-        15 => simd_shuffle16(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
+        0 => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle16!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
+        2 => simd_shuffle16!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),
+        3 => simd_shuffle16!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
+        4 => simd_shuffle16!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+        5 => simd_shuffle16!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
+        6 => simd_shuffle16!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
+        7 => simd_shuffle16!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]),
+        8 => simd_shuffle16!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
+        9 => simd_shuffle16!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]),
+        10 => simd_shuffle16!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
+        11 => simd_shuffle16!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
+        12 => simd_shuffle16!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
+        13 => simd_shuffle16!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
+        14 => simd_shuffle16!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+        15 => simd_shuffle16!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3282,10 +3282,10 @@ pub unsafe fn vextq_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t
 pub unsafe fn vext_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3300,14 +3300,14 @@ pub unsafe fn vext_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t
 pub unsafe fn vextq_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
    static_assert_imm3!(N);
    match N & 0b111 {
-        0 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        1 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        2 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        3 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        4 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        5 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        6 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        7 => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        1 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        2 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        3 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        4 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        5 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        6 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        7 => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3322,8 +3322,8 @@ pub unsafe fn vextq_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_
 pub unsafe fn vextq_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3338,8 +3338,8 @@ pub unsafe fn vextq_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
 pub unsafe fn vextq_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3354,8 +3354,8 @@ pub unsafe fn vextq_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_
 pub unsafe fn vext_f32<const N: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
    static_assert_imm1!(N);
    match N & 0b1 {
-        0 => simd_shuffle2(a, b, [0, 1]),
-        1 => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [0, 1]),
+        1 => simd_shuffle2!(a, b, [1, 2]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3370,10 +3370,10 @@ pub unsafe fn vext_f32<const N: i32>(a: float32x2_t, b: float32x2_t) -> float32x
 pub unsafe fn vextq_f32<const N: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
    static_assert_imm2!(N);
    match N & 0b11 {
-        0 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        2 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        3 => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        2 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        3 => simd_shuffle4!(a, b, [3, 4, 5, 6]),
        _ => unreachable_unchecked(),
    }
 }
@@ -3627,7 +3627,7 @@ pub unsafe fn vmlaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
    static_assert_imm2!(LANE);
-    vmla_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmla_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3639,7 +3639,7 @@ pub unsafe fn vmla_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int1
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
    static_assert_imm3!(LANE);
-    vmla_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmla_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3651,7 +3651,7 @@ pub unsafe fn vmla_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
    static_assert_imm2!(LANE);
-    vmlaq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3663,7 +3663,7 @@ pub unsafe fn vmlaq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
    static_assert_imm3!(LANE);
-    vmlaq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3675,7 +3675,7 @@ pub unsafe fn vmlaq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
    static_assert_imm1!(LANE);
-    vmla_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3687,7 +3687,7 @@ pub unsafe fn vmla_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int3
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
    static_assert_imm2!(LANE);
-    vmla_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3699,7 +3699,7 @@ pub unsafe fn vmla_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
    static_assert_imm1!(LANE);
-    vmlaq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3711,7 +3711,7 @@ pub unsafe fn vmlaq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlaq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3723,7 +3723,7 @@ pub unsafe fn vmlaq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
    static_assert_imm2!(LANE);
-    vmla_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmla_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3735,7 +3735,7 @@ pub unsafe fn vmla_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: ui
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t {
    static_assert_imm3!(LANE);
-    vmla_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmla_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3747,7 +3747,7 @@ pub unsafe fn vmla_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t {
    static_assert_imm2!(LANE);
-    vmlaq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3759,7 +3759,7 @@ pub unsafe fn vmlaq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
    static_assert_imm3!(LANE);
-    vmlaq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3771,7 +3771,7 @@ pub unsafe fn vmlaq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
    static_assert_imm1!(LANE);
-    vmla_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3783,7 +3783,7 @@ pub unsafe fn vmla_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: ui
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t {
    static_assert_imm2!(LANE);
-    vmla_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3795,7 +3795,7 @@ pub unsafe fn vmla_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t {
    static_assert_imm1!(LANE);
-    vmlaq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3807,7 +3807,7 @@ pub unsafe fn vmlaq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlaq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3819,7 +3819,7 @@ pub unsafe fn vmlaq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
    static_assert_imm1!(LANE);
-    vmla_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3831,7 +3831,7 @@ pub unsafe fn vmla_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmla_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
    static_assert_imm2!(LANE);
-    vmla_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmla_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3843,7 +3843,7 @@ pub unsafe fn vmla_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
    static_assert_imm1!(LANE);
-    vmlaq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply accumulate with scalar
@@ -3855,7 +3855,7 @@ pub unsafe fn vmlaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
    static_assert_imm2!(LANE);
-    vmlaq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlaq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Signed multiply-add long
@@ -3967,7 +3967,7 @@ pub unsafe fn vmlal_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlal_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -3979,7 +3979,7 @@ pub unsafe fn vmlal_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmlal_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -3991,7 +3991,7 @@ pub unsafe fn vmlal_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmlal_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlal_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -4003,7 +4003,7 @@ pub unsafe fn vmlal_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmlal_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlal_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -4015,7 +4015,7 @@ pub unsafe fn vmlal_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlal_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -4027,7 +4027,7 @@ pub unsafe fn vmlal_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmlal_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlal_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -4039,7 +4039,7 @@ pub unsafe fn vmlal_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmlal_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlal_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply accumulate with scalar
@@ -4051,7 +4051,7 @@ pub unsafe fn vmlal_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlal_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmlal_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlal_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Multiply-subtract from accumulator
@@ -4303,7 +4303,7 @@ pub unsafe fn vmlsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
    static_assert_imm2!(LANE);
-    vmls_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmls_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4315,7 +4315,7 @@ pub unsafe fn vmls_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int1
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
    static_assert_imm3!(LANE);
-    vmls_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmls_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4327,7 +4327,7 @@ pub unsafe fn vmls_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
    static_assert_imm2!(LANE);
-    vmlsq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4339,7 +4339,7 @@ pub unsafe fn vmlsq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
    static_assert_imm3!(LANE);
-    vmlsq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_s16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4351,7 +4351,7 @@ pub unsafe fn vmlsq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
    static_assert_imm1!(LANE);
-    vmls_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4363,7 +4363,7 @@ pub unsafe fn vmls_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int3
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
    static_assert_imm2!(LANE);
-    vmls_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4375,7 +4375,7 @@ pub unsafe fn vmls_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
    static_assert_imm1!(LANE);
-    vmlsq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4387,7 +4387,7 @@ pub unsafe fn vmlsq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlsq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_s32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4399,7 +4399,7 @@ pub unsafe fn vmlsq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
    static_assert_imm2!(LANE);
-    vmls_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmls_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4411,7 +4411,7 @@ pub unsafe fn vmls_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: ui
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t {
    static_assert_imm3!(LANE);
-    vmls_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmls_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4423,7 +4423,7 @@ pub unsafe fn vmls_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t {
    static_assert_imm2!(LANE);
-    vmlsq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4435,7 +4435,7 @@ pub unsafe fn vmlsq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
    static_assert_imm3!(LANE);
-    vmlsq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_u16(a, b, simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4447,7 +4447,7 @@ pub unsafe fn vmlsq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
    static_assert_imm1!(LANE);
-    vmls_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4459,7 +4459,7 @@ pub unsafe fn vmls_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: ui
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t {
    static_assert_imm2!(LANE);
-    vmls_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4471,7 +4471,7 @@ pub unsafe fn vmls_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t {
    static_assert_imm1!(LANE);
-    vmlsq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4483,7 +4483,7 @@ pub unsafe fn vmlsq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlsq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_u32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4495,7 +4495,7 @@ pub unsafe fn vmlsq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
    static_assert_imm1!(LANE);
-    vmls_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4507,7 +4507,7 @@ pub unsafe fn vmls_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmls_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
    static_assert_imm2!(LANE);
-    vmls_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmls_f32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4519,7 +4519,7 @@ pub unsafe fn vmls_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
    static_assert_imm1!(LANE);
-    vmlsq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector multiply subtract with scalar
@@ -4531,7 +4531,7 @@ pub unsafe fn vmlsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
    static_assert_imm2!(LANE);
-    vmlsq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsq_f32(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Signed multiply-subtract long
@@ -4643,7 +4643,7 @@ pub unsafe fn vmlsl_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmlsl_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4655,7 +4655,7 @@ pub unsafe fn vmlsl_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmlsl_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_s16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4667,7 +4667,7 @@ pub unsafe fn vmlsl_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmlsl_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlsl_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4679,7 +4679,7 @@ pub unsafe fn vmlsl_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmlsl_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlsl_s32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4691,7 +4691,7 @@ pub unsafe fn vmlsl_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: in
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmlsl_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4703,7 +4703,7 @@ pub unsafe fn vmlsl_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmlsl_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmlsl_u16(a, b, simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4715,7 +4715,7 @@ pub unsafe fn vmlsl_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmlsl_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlsl_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector widening multiply subtract with scalar
@@ -4727,7 +4727,7 @@ pub unsafe fn vmlsl_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: u
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vmlsl_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmlsl_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
+    vmlsl_u32(a, b, simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Negate
@@ -6115,7 +6115,7 @@ pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6127,7 +6127,7 @@ pub unsafe fn vmul_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int1
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
    static_assert_imm3!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6139,7 +6139,7 @@ pub unsafe fn vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6151,7 +6151,7 @@ pub unsafe fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    static_assert_imm3!(LANE);
-    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6163,7 +6163,7 @@ pub unsafe fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> in
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6175,7 +6175,7 @@ pub unsafe fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int3
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6187,7 +6187,7 @@ pub unsafe fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6199,7 +6199,7 @@ pub unsafe fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6211,7 +6211,7 @@ pub unsafe fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> in
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6223,7 +6223,7 @@ pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
    static_assert_imm3!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6235,7 +6235,7 @@ pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6247,7 +6247,7 @@ pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
    static_assert_imm3!(LANE);
-    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6259,7 +6259,7 @@ pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6271,7 +6271,7 @@ pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> ui
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6283,7 +6283,7 @@ pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Multiply
@@ -6295,7 +6295,7 @@ pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -6307,7 +6307,7 @@ pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -6319,7 +6319,7 @@ pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -6331,7 +6331,7 @@ pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
    static_assert_imm1!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Floating-point multiply
@@ -6343,7 +6343,7 @@ pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmulq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
    static_assert_imm2!(LANE);
-    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    simd_mul(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Signed multiply long
@@ -6507,7 +6507,7 @@ pub unsafe fn vmulls_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_s16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6519,7 +6519,7 @@ pub unsafe fn vmull_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
    static_assert_imm3!(LANE);
-    vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_s16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6531,7 +6531,7 @@ pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> in
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
    static_assert_imm1!(LANE);
-    vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmull_s32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6543,7 +6543,7 @@ pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
    static_assert_imm2!(LANE);
-    vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmull_s32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6555,7 +6555,7 @@ pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> in
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
    static_assert_imm2!(LANE);
-    vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_u16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6567,7 +6567,7 @@ pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
    static_assert_imm3!(LANE);
-    vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+    vmull_u16(a, simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6579,7 +6579,7 @@ pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
    static_assert_imm1!(LANE);
-    vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmull_u32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Vector long multiply by scalar
@@ -6591,7 +6591,7 @@ pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> u
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
    static_assert_imm2!(LANE);
-    vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+    vmull_u32(a, simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]))
 }

 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -6902,7 +6902,7 @@ pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
    let d: int8x8_t = vsubhn_s16(b, c);
-    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Subtract returning high narrow
@@ -6913,7 +6913,7 @@ pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x1
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
    let d: int16x4_t = vsubhn_s32(b, c);
-    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Subtract returning high narrow
@@ -6924,7 +6924,7 @@ pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
    let d: int32x2_t = vsubhn_s64(b, c);
-    simd_shuffle4(a, d, [0, 1, 2, 3])
+    simd_shuffle4!(a, d, [0, 1, 2, 3])
 }

 /// Subtract returning high narrow
@@ -6935,7 +6935,7 @@ pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
    let d: uint8x8_t = vsubhn_u16(b, c);
-    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Subtract returning high narrow
@@ -6946,7 +6946,7 @@ pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uin
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
    let d: uint16x4_t = vsubhn_u32(b, c);
-    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Subtract returning high narrow
@@ -6957,7 +6957,7 @@ pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> ui
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
 pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
    let d: uint32x2_t = vsubhn_u64(b, c);
-    simd_shuffle4(a, d, [0, 1, 2, 3])
+    simd_shuffle4!(a, d, [0, 1, 2, 3])
 }

 /// Signed halving subtract
@@ -7857,7 +7857,7 @@ pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
    static_assert_imm2!(N);
-    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const N: i32> [N as u32, N as u32, N as u32, N as u32]);
    vqdmull_s16(a, b)
 }

@@ -7870,7 +7870,7 @@ pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int3
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
    static_assert_imm1!(N);
-    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const N: i32> [N as u32, N as u32]);
    vqdmull_s32(a, b)
 }

@@ -8223,7 +8223,7 @@ pub unsafe fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
    static_assert_imm2!(LANE);
-    let b: int16x4_t = simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulh_s16(a, b)
 }

@@ -8236,7 +8236,7 @@ pub unsafe fn vqrdmulh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
    static_assert_imm3!(LANE);
-    let b: int16x4_t = simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int16x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulh_s16(a, b)
 }

@@ -8249,7 +8249,7 @@ pub unsafe fn vqrdmulh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulhq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
    static_assert_imm2!(LANE);
-    let b: int16x8_t = simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int16x8_t = simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulhq_s16(a, b)
 }

@@ -8262,7 +8262,7 @@ pub unsafe fn vqrdmulhq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulhq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    static_assert_imm3!(LANE);
-    let b: int16x8_t = simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int16x8_t = simd_shuffle8!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulhq_s16(a, b)
 }

@@ -8275,7 +8275,7 @@ pub unsafe fn vqrdmulhq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
    static_assert_imm1!(LANE);
-    let b: int32x2_t = simd_shuffle2(b, b, [LANE as u32, LANE as u32]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]);
    vqrdmulh_s32(a, b)
 }

@@ -8288,7 +8288,7 @@ pub unsafe fn vqrdmulh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
    static_assert_imm2!(LANE);
-    let b: int32x2_t = simd_shuffle2(b, b, [LANE as u32, LANE as u32]);
+    let b: int32x2_t = simd_shuffle2!(b, b, <const LANE: i32> [LANE as u32, LANE as u32]);
    vqrdmulh_s32(a, b)
 }

@@ -8301,7 +8301,7 @@ pub unsafe fn vqrdmulh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulhq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
    static_assert_imm1!(LANE);
-    let b: int32x4_t = simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int32x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulhq_s32(a, b)
 }

@@ -8314,7 +8314,7 @@ pub unsafe fn vqrdmulhq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) ->
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vqrdmulhq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
    static_assert_imm2!(LANE);
-    let b: int32x4_t = simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    let b: int32x4_t = simd_shuffle4!(b, b, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vqrdmulhq_s32(a, b)
 }

@@ -580,7 +580,7 @@ pub unsafe fn vld1q_lane_f32<const LANE: i32>(ptr: *const f32, src: float32x4_t)
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
    let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -591,7 +591,7 @@ pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
    let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0)));
-    simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -602,7 +602,7 @@ pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
    let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -613,7 +613,7 @@ pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
    let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -624,7 +624,7 @@ pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
    let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0)));
-    simd_shuffle2(x, x, [0, 0])
+    simd_shuffle2!(x, x, [0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -635,7 +635,7 @@ pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
    let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -663,7 +663,7 @@ pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
    let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0)));
-    simd_shuffle2(x, x, [0, 0])
+    simd_shuffle2!(x, x, [0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -674,7 +674,7 @@ pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
    let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -685,7 +685,7 @@ pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
    let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0)));
-    simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -696,7 +696,7 @@ pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
    let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -707,7 +707,7 @@ pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
    let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -718,7 +718,7 @@ pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
    let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0)));
-    simd_shuffle2(x, x, [0, 0])
+    simd_shuffle2!(x, x, [0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -729,7 +729,7 @@ pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
    let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -757,7 +757,7 @@ pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
    let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0)));
-    simd_shuffle2(x, x, [0, 0])
+    simd_shuffle2!(x, x, [0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -768,7 +768,7 @@ pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
    let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -779,7 +779,7 @@ pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
    let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0)));
-    simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -790,7 +790,7 @@ pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
    let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -801,7 +801,7 @@ pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
    let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0)));
-    simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -812,7 +812,7 @@ pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
    let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.)));
-    simd_shuffle2(x, x, [0, 0])
+    simd_shuffle2!(x, x, [0, 0])
 }

 /// Load one single-element structure and Replicate to all lanes (of one register).
@@ -823,7 +823,7 @@ pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
    let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.)));
-    simd_shuffle4(x, x, [0, 0, 0, 0])
+    simd_shuffle4!(x, x, [0, 0, 0, 0])
 }

 // signed absolute difference and accumulate (64-bit)
@@ -1284,8 +1284,8 @@ pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
 pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
-    let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let a: int16x8_t = simd_cast(a);
    let b: int16x8_t = simd_cast(b);
    simd_add(a, b)
@@ -1298,8 +1298,8 @@ pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
 pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
-    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let a: int32x4_t = simd_cast(a);
    let b: int32x4_t = simd_cast(b);
    simd_add(a, b)
@@ -1312,8 +1312,8 @@ pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
 pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
-    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let a: int64x2_t = simd_cast(a);
    let b: int64x2_t = simd_cast(b);
    simd_add(a, b)
@@ -1326,8 +1326,8 @@ pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
 pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
-    let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-    let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let a: uint16x8_t = simd_cast(a);
    let b: uint16x8_t = simd_cast(b);
    simd_add(a, b)
@@ -1340,8 +1340,8 @@ pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
 pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
-    let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
-    let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let a: uint32x4_t = simd_cast(a);
    let b: uint32x4_t = simd_cast(b);
    simd_add(a, b)
@@ -1354,8 +1354,8 @@ pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
 pub unsafe fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
-    let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
-    let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let a: uint64x2_t = simd_cast(a);
    let b: uint64x2_t = simd_cast(b);
    simd_add(a, b)
@@ -1434,7 +1434,7 @@ pub unsafe fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
 pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
-    let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let b: int16x8_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1446,7 +1446,7 @@ pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
 pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
-    let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let b: int32x4_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1458,7 +1458,7 @@ pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
 pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
-    let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let b: int64x2_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1470,7 +1470,7 @@ pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
 pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
-    let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
+    let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
    let b: uint16x8_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1482,7 +1482,7 @@ pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
 pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
-    let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
+    let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
    let b: uint32x4_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1494,7 +1494,7 @@ pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
 pub unsafe fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
-    let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
+    let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
    let b: uint64x2_t = simd_cast(b);
    simd_add(a, b)
 }
@@ -1567,7 +1567,7 @@ pub unsafe fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
    let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
-    simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Add returning High Narrow (high half).
@@ -1578,7 +1578,7 @@ pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x1
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
    let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16)));
-    simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Add returning High Narrow (high half).
@@ -1589,7 +1589,7 @@ pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
    let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32)));
-    simd_shuffle4(r, x, [0, 1, 2, 3])
+    simd_shuffle4!(r, x, [0, 1, 2, 3])
 }

 /// Add returning High Narrow (high half).
@@ -1600,7 +1600,7 @@ pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
    let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
-    simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Add returning High Narrow (high half).
@@ -1611,7 +1611,7 @@ pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uin
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
    let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16)));
-    simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Add returning High Narrow (high half).
@@ -1622,7 +1622,7 @@ pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> ui
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
 pub unsafe fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
    let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32)));
-    simd_shuffle4(r, x, [0, 1, 2, 3])
+    simd_shuffle4!(r, x, [0, 1, 2, 3])
 }

 /// Rounding Add returning High Narrow.
@@ -1693,7 +1693,7 @@ pub unsafe fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
    let x = vraddhn_s16_(a, b);
-    simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Rounding Add returning High Narrow (high half).
@@ -1704,7 +1704,7 @@ pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
    let x = vraddhn_s32_(a, b);
-    simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Rounding Add returning High Narrow (high half).
@@ -1715,7 +1715,7 @@ pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int1
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
    let x = vraddhn_s64_(a, b);
-    simd_shuffle4(r, x, [0, 1, 2, 3])
+    simd_shuffle4!(r, x, [0, 1, 2, 3])
 }

 /// Rounding Add returning High Narrow (high half).
@@ -1726,7 +1726,7 @@ pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int3
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
    let x: uint8x8_t = transmute(vraddhn_s16_(transmute(a), transmute(b)));
-    simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Rounding Add returning High Narrow (high half).
@@ -1737,7 +1737,7 @@ pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> ui
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
    let x: uint16x4_t = transmute(vraddhn_s32_(transmute(a), transmute(b)));
-    simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Rounding Add returning High Narrow (high half).
@@ -1748,7 +1748,7 @@ pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> u
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
 pub unsafe fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
    let x: uint32x2_t = transmute(vraddhn_s64_(transmute(a), transmute(b)));
-    simd_shuffle4(r, x, [0, 1, 2, 3])
+    simd_shuffle4!(r, x, [0, 1, 2, 3])
 }

 /// Signed Add Long Pairwise.
@@ -2961,7 +2961,7 @@ pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
-    simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Duplicate vector element to vector or scalar
@@ -2971,7 +2971,7 @@ pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
-    simd_shuffle4(a, a, [4, 5, 6, 7])
+    simd_shuffle4!(a, a, [4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -2981,7 +2981,7 @@ pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_s32(a: int32x4_t) -> int32x2_t {
-    simd_shuffle2(a, a, [2, 3])
+    simd_shuffle2!(a, a, [2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3001,7 +3001,7 @@ pub unsafe fn vget_high_s64(a: int64x2_t) -> int64x1_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
-    simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Duplicate vector element to vector or scalar
@@ -3011,7 +3011,7 @@ pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
-    simd_shuffle4(a, a, [4, 5, 6, 7])
+    simd_shuffle4!(a, a, [4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -3021,7 +3021,7 @@ pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_u32(a: uint32x4_t) -> uint32x2_t {
-    simd_shuffle2(a, a, [2, 3])
+    simd_shuffle2!(a, a, [2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3041,7 +3041,7 @@ pub unsafe fn vget_high_u64(a: uint64x2_t) -> uint64x1_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
-    simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
+    simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
 }

 /// Duplicate vector element to vector or scalar
@@ -3051,7 +3051,7 @@ pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
-    simd_shuffle4(a, a, [4, 5, 6, 7])
+    simd_shuffle4!(a, a, [4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -3061,7 +3061,7 @@ pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
 pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
-    simd_shuffle2(a, a, [2, 3])
+    simd_shuffle2!(a, a, [2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3071,7 +3071,7 @@ pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -3081,7 +3081,7 @@ pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3091,7 +3091,7 @@ pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Duplicate vector element to vector or scalar
@@ -3111,7 +3111,7 @@ pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -3121,7 +3121,7 @@ pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3131,7 +3131,7 @@ pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Duplicate vector element to vector or scalar
@@ -3151,7 +3151,7 @@ pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Duplicate vector element to vector or scalar
@@ -3161,7 +3161,7 @@ pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Duplicate vector element to vector or scalar
@@ -3171,7 +3171,7 @@ pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
 pub unsafe fn vget_low_f32(a: float32x4_t) -> float32x2_t {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Duplicate vector element to vector or scalar
@@ -3713,7 +3713,7 @@ pub unsafe fn vcntq_p8(a: poly8x16_t) -> poly8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3723,7 +3723,7 @@ pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+    simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
 }

 /// Reversing vector elements (swap endianness)
@@ -3733,7 +3733,7 @@ pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3743,7 +3743,7 @@ pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+    simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
 }

 /// Reversing vector elements (swap endianness)
@@ -3753,7 +3753,7 @@ pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3763,7 +3763,7 @@ pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
 pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
+    simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
 }

 /// Reversing vector elements (swap endianness)
@@ -3773,7 +3773,7 @@ pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Reversing vector elements (swap endianness)
@@ -3783,7 +3783,7 @@ pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+    simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
 }

 /// Reversing vector elements (swap endianness)
@@ -3793,7 +3793,7 @@ pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Reversing vector elements (swap endianness)
@@ -3803,7 +3803,7 @@ pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+    simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
 }

 /// Reversing vector elements (swap endianness)
@@ -3813,7 +3813,7 @@ pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -3823,7 +3823,7 @@ pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3833,7 +3833,7 @@ pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -3843,7 +3843,7 @@ pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3853,7 +3853,7 @@ pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -3863,7 +3863,7 @@ pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
+    simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
 }

 /// Reversing vector elements (swap endianness)
@@ -3873,7 +3873,7 @@ pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Reversing vector elements (swap endianness)
@@ -3883,7 +3883,7 @@ pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
 pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
+    simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
 }

 /// Reversing vector elements (swap endianness)
@@ -3893,7 +3893,7 @@ pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
-    simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+    simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -3903,7 +3903,7 @@ pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
-    simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+    simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
 }

 /// Reversing vector elements (swap endianness)
@@ -3913,7 +3913,7 @@ pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
-    simd_shuffle4(a, a, [3, 2, 1, 0])
+    simd_shuffle4!(a, a, [3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -3923,7 +3923,7 @@ pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Reversing vector elements (swap endianness)
@@ -3933,7 +3933,7 @@ pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
-    simd_shuffle2(a, a, [1, 0])
+    simd_shuffle2!(a, a, [1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -3943,7 +3943,7 @@ pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -3953,7 +3953,7 @@ pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
-    simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+    simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -3963,7 +3963,7 @@ pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
-    simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+    simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
 }

 /// Reversing vector elements (swap endianness)
@@ -3973,7 +3973,7 @@ pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
-    simd_shuffle4(a, a, [3, 2, 1, 0])
+    simd_shuffle4!(a, a, [3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -3983,7 +3983,7 @@ pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Reversing vector elements (swap endianness)
@@ -3993,7 +3993,7 @@ pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
-    simd_shuffle2(a, a, [1, 0])
+    simd_shuffle2!(a, a, [1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -4003,7 +4003,7 @@ pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -4013,7 +4013,7 @@ pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
-    simd_shuffle2(a, a, [1, 0])
+    simd_shuffle2!(a, a, [1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -4023,7 +4023,7 @@ pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
-    simd_shuffle4(a, a, [1, 0, 3, 2])
+    simd_shuffle4!(a, a, [1, 0, 3, 2])
 }

 /// Reversing vector elements (swap endianness)
@@ -4033,7 +4033,7 @@ pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
-    simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
+    simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -4043,7 +4043,7 @@ pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
-    simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
+    simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
 }

 /// Reversing vector elements (swap endianness)
@@ -4053,7 +4053,7 @@ pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
-    simd_shuffle4(a, a, [3, 2, 1, 0])
+    simd_shuffle4!(a, a, [3, 2, 1, 0])
 }

 /// Reversing vector elements (swap endianness)
@@ -4063,7 +4063,7 @@ pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
 pub unsafe fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t {
-    simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
+    simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
 }

 /// Signed Add and Accumulate Long Pairwise.
@@ -92,3 +92,99 @@ macro_rules! types {
        pub struct $name($($fields)*);
    )*)
 }
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle2 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 2] = $idx;
+        }
+
+        simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 2] = $idx;
+        simd_shuffle2($x, $y, IDX)
+    }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle4 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 4] = $idx;
+        }
+
+        simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 4] = $idx;
+        simd_shuffle4($x, $y, IDX)
+    }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle8 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 8] = $idx;
+        }
+
+        simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 8] = $idx;
+        simd_shuffle8($x, $y, IDX)
+    }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle16 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 16] = $idx;
+        }
+
+        simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 16] = $idx;
+        simd_shuffle16($x, $y, IDX)
+    }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle32 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 32] = $idx;
+        }
+
+        simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 32] = $idx;
+        simd_shuffle32($x, $y, IDX)
+    }};
+}
+
+#[allow(unused_macros)]
+macro_rules! simd_shuffle64 {
+    ($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
+        struct ConstParam<$(const $imm: $ty),+>;
+        impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
+            const IDX: [u32; 64] = $idx;
+        }
+
+        simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX)
+    }};
+    ($x:expr, $y:expr, $idx:expr $(,)?) => {{
+        const IDX: [u32; 64] = $idx;
+        simd_shuffle64($x, $y, IDX)
+    }};
+}
@@ -47,10 +47,10 @@ pub trait VectorPermDI {
    #[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))]
    unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
        match dm & 0b11 {
-            0 => simd_shuffle2(a, b, [0b00, 0b10]),
-            1 => simd_shuffle2(a, b, [0b01, 0b10]),
-            2 => simd_shuffle2(a, b, [0b00, 0b11]),
-            _ => simd_shuffle2(a, b, [0b01, 0b11]),
+            0 => simd_shuffle2!(a, b, [0b00, 0b10]),
+            1 => simd_shuffle2!(a, b, [0b01, 0b10]),
+            2 => simd_shuffle2!(a, b, [0b00, 0b11]),
+            _ => simd_shuffle2!(a, b, [0b01, 0b11]),
        }
    }

@@ -118,10 +118,10 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm8!(MASK);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        b,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 4,
            ((MASK as u32 >> 2) & 0b1) + 2,
@@ -141,10 +141,10 @@ pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m2
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(MASK);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 8,
@@ -463,10 +463,10 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm4!(IMM4);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        b,
-        [
+        <const IMM4: i32> [
            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
@@ -486,10 +486,10 @@ pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        b,
-        [
+        <const IMM8: i32> [
            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
@@ -930,10 +930,10 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_imm1!(IMM1);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        _mm256_undefined_ps(),
-        [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
+        <const IMM1: i32> [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
    )
 }

@@ -951,7 +951,7 @@ pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_imm1!(IMM1);
-    simd_shuffle2(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
+    simd_shuffle2!(a, _mm256_undefined_pd(), <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize])
 }

 /// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
@@ -967,10 +967,10 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_imm1!(IMM1);
-    let dst: i64x2 = simd_shuffle2(
+    let dst: i64x2 = simd_shuffle2!(
        a.as_i64x4(),
        _mm256_undefined_si256().as_i64x4(),
-        [[0, 1], [2, 3]][IMM1 as usize],
+        <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize],
    );
    transmute(dst)
 }
@@ -1033,10 +1033,10 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        _mm256_undefined_ps(),
-        [
+        <const IMM8: i32> [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -1060,10 +1060,10 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        _mm_undefined_ps(),
-        [
+        <const IMM8: i32> [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -1107,10 +1107,10 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_imm4!(IMM4);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        _mm256_undefined_pd(),
-        [
+        <const IMM4: i32> [
            ((IMM4 as u32 >> 0) & 1),
            ((IMM4 as u32 >> 1) & 1),
            ((IMM4 as u32 >> 2) & 1) + 2,
@@ -1130,10 +1130,10 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_imm2!(IMM2);
-    simd_shuffle2(
+    simd_shuffle2!(
        a,
        _mm_undefined_pd(),
-        [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
+        <const IMM2: i32> [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
    )
 }

@@ -1257,10 +1257,10 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_imm1!(IMM1);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        _mm256_castps128_ps256(b),
-        [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
+        <const IMM1: i32> [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
    )
 }

@@ -1279,10 +1279,10 @@ pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_imm1!(IMM1);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        _mm256_castpd128_pd256(b),
-        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+        <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    )
 }

@@ -1300,10 +1300,10 @@ pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> _
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_imm1!(IMM1);
-    let dst: i64x4 = simd_shuffle4(
+    let dst: i64x4 = simd_shuffle4!(
        a.as_i64x4(),
        _mm256_castsi128_si256(b).as_i64x4(),
-        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+        <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    );
    transmute(dst)
 }
@@ -1639,7 +1639,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
 #[cfg_attr(test, assert_instr(vmovshdup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
-    simd_shuffle8(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
+    simd_shuffle8!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
 }

 /// Duplicate even-indexed single-precision (32-bit) floating-point elements
@@ -1651,7 +1651,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vmovsldup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
-    simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
+    simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
 }

 /// Duplicate even-indexed double-precision (64-bit) floating-point elements
@@ -1663,7 +1663,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
-    simd_shuffle4(a, a, [0, 0, 2, 2])
+    simd_shuffle4!(a, a, [0, 0, 2, 2])
 }

 /// Loads 256-bits of integer data from unaligned memory into result.
@@ -1756,7 +1756,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vunpckhpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_shuffle4(a, b, [1, 5, 3, 7])
+    simd_shuffle4!(a, b, [1, 5, 3, 7])
 }

 /// Unpacks and interleave single-precision (32-bit) floating-point elements
@@ -1768,7 +1768,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vunpckhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
-    simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
+    simd_shuffle8!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
 }

 /// Unpacks and interleave double-precision (64-bit) floating-point elements
@@ -1780,7 +1780,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vunpcklpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_shuffle4(a, b, [0, 4, 2, 6])
+    simd_shuffle4!(a, b, [0, 4, 2, 6])
 }

 /// Unpacks and interleave single-precision (32-bit) floating-point elements
@@ -1792,7 +1792,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vunpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
-    simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
+    simd_shuffle8!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
 }

 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -2572,7 +2572,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Casts vector of type __m256d to type __m128d.
@@ -2584,7 +2584,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Casts vector of type __m256i to type __m128i.
@@ -2597,7 +2597,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    let a = a.as_i64x4();
-    let dst: i64x2 = simd_shuffle2(a, a, [0, 1]);
+    let dst: i64x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(dst)
 }

@@ -2611,8 +2611,8 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
-    // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
-    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
+    // FIXME simd_shuffle8!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
 }

 /// Casts vector of type __m128d to type __m256d;
@@ -2625,8 +2625,8 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
-    // FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
-    simd_shuffle4(a, a, [0, 1, 0, 0])
+    // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
+    simd_shuffle4!(a, a, [0, 1, 0, 0])
 }

 /// Casts vector of type __m128i to type __m256i;
@@ -2640,8 +2640,8 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    let a = a.as_i64x2();
-    // FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
-    let dst: i64x4 = simd_shuffle4(a, a, [0, 1, 0, 0]);
+    // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
+    let dst: i64x4 = simd_shuffle4!(a, a, [0, 1, 0, 0]);
    transmute(dst)
 }

@@ -2656,7 +2656,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
-    simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
@@ -2671,7 +2671,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    let b = _mm_setzero_si128().as_i64x2();
-    let dst: i64x4 = simd_shuffle4(a.as_i64x2(), b, [0, 1, 2, 3]);
+    let dst: i64x4 = simd_shuffle4!(a.as_i64x2(), b, [0, 1, 2, 3]);
    transmute(dst)
 }

@@ -2687,7 +2687,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
-    simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
+    simd_shuffle4!(a, _mm_setzero_pd(), [0, 1, 2, 3])
 }

 /// Returns vector of type `__m256` with undefined elements.
@@ -2732,7 +2732,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i {
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
-    simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Sets packed __m256d returned vector with the supplied values.
@@ -175,7 +175,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
    let b = b.as_i8x32();

    let r: i8x32 = match IMM8 % 16 {
-        0 => simd_shuffle32(
+        0 => simd_shuffle32!(
            b,
            a,
            [
@@ -183,7 +183,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                23, 24, 25, 26, 27, 28, 29, 30, 31,
            ],
        ),
-        1 => simd_shuffle32(
+        1 => simd_shuffle32!(
            b,
            a,
            [
@@ -191,7 +191,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                24, 25, 26, 27, 28, 29, 30, 31, 48,
            ],
        ),
-        2 => simd_shuffle32(
+        2 => simd_shuffle32!(
            b,
            a,
            [
@@ -199,7 +199,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                25, 26, 27, 28, 29, 30, 31, 48, 49,
            ],
        ),
-        3 => simd_shuffle32(
+        3 => simd_shuffle32!(
            b,
            a,
            [
@@ -207,7 +207,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
            ],
        ),
-        4 => simd_shuffle32(
+        4 => simd_shuffle32!(
            b,
            a,
            [
@@ -215,7 +215,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
            ],
        ),
-        5 => simd_shuffle32(
+        5 => simd_shuffle32!(
            b,
            a,
            [
@@ -223,7 +223,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
            ],
        ),
-        6 => simd_shuffle32(
+        6 => simd_shuffle32!(
            b,
            a,
            [
@@ -231,7 +231,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
            ],
        ),
-        7 => simd_shuffle32(
+        7 => simd_shuffle32!(
            b,
            a,
            [
@@ -239,7 +239,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
            ],
        ),
-        8 => simd_shuffle32(
+        8 => simd_shuffle32!(
            b,
            a,
            [
@@ -247,7 +247,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
            ],
        ),
-        9 => simd_shuffle32(
+        9 => simd_shuffle32!(
            b,
            a,
            [
@@ -255,7 +255,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
            ],
        ),
-        10 => simd_shuffle32(
+        10 => simd_shuffle32!(
            b,
            a,
            [
@@ -263,7 +263,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
            ],
        ),
-        11 => simd_shuffle32(
+        11 => simd_shuffle32!(
            b,
            a,
            [
@@ -271,7 +271,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
            ],
        ),
-        12 => simd_shuffle32(
+        12 => simd_shuffle32!(
            b,
            a,
            [
@@ -279,7 +279,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
            ],
        ),
-        13 => simd_shuffle32(
+        13 => simd_shuffle32!(
            b,
            a,
            [
@@ -287,7 +287,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
            ],
        ),
-        14 => simd_shuffle32(
+        14 => simd_shuffle32!(
            b,
            a,
            [
@@ -295,7 +295,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
            ],
        ),
-        15 => simd_shuffle32(
+        15 => simd_shuffle32!(
            b,
            a,
            [
@@ -370,10 +370,10 @@ pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128
    static_assert_imm4!(IMM4);
    let a = a.as_i32x4();
    let b = b.as_i32x4();
-    let r: i32x4 = simd_shuffle4(
+    let r: i32x4 = simd_shuffle4!(
        a,
        b,
-        [
+        <const IMM4: i32> [
            [0, 4, 0, 4][IMM4 as usize & 0b11],
            [1, 1, 5, 5][IMM4 as usize & 0b11],
            [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
@@ -395,10 +395,10 @@ pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
    static_assert_imm8!(IMM8);
    let a = a.as_i32x8();
    let b = b.as_i32x8();
-    let r: i32x8 = simd_shuffle8(
+    let r: i32x8 = simd_shuffle8!(
        a,
        b,
-        [
+        <const IMM8: i32> [
            [0, 8, 0, 8][IMM8 as usize & 0b11],
            [1, 1, 9, 9][IMM8 as usize & 0b11],
            [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
@@ -424,10 +424,11 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
    static_assert_imm8!(IMM8);
    let a = a.as_i16x16();
    let b = b.as_i16x16();
-    let r: i16x16 = simd_shuffle16(
+
+    let r: i16x16 = simd_shuffle16!(
        a,
        b,
-        [
+        <const IMM8: i32> [
            [0, 16, 0, 16][IMM8 as usize & 0b11],
            [1, 1, 17, 17][IMM8 as usize & 0b11],
            [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
@@ -470,7 +471,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
+    let ret = simd_shuffle16!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
    transmute::<i8x16, _>(ret)
 }

@@ -484,7 +485,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
+    let ret = simd_shuffle32!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
    transmute::<i8x32, _>(ret)
 }

@@ -500,7 +501,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
+    let ret = simd_shuffle4!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
    transmute::<i32x4, _>(ret)
 }

@@ -516,7 +517,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
+    let ret = simd_shuffle8!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
    transmute::<i32x8, _>(ret)
 }

@@ -530,7 +531,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
-    let ret = simd_shuffle2(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+    let ret = simd_shuffle2!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
    transmute::<i64x2, _>(ret)
 }

@@ -543,7 +544,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
-    let ret = simd_shuffle4(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+    let ret = simd_shuffle4!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
    transmute::<i64x4, _>(ret)
 }

@@ -556,7 +557,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
-    simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
+    simd_shuffle2!(a, _mm_setzero_pd(), [0_u32; 2])
 }

 /// Broadcasts the low double-precision (64-bit) floating-point element
@@ -568,7 +569,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
-    simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
+    simd_shuffle4!(a, _mm_setzero_pd(), [0_u32; 4])
 }

 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
@@ -582,7 +583,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+    let ret = simd_shuffle4!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
    transmute::<i64x4, _>(ret)
 }

@@ -595,7 +596,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
-    simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
+    simd_shuffle4!(a, _mm_setzero_ps(), [0_u32; 4])
 }

 /// Broadcasts the low single-precision (32-bit) floating-point element
@@ -607,7 +608,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
-    simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
+    simd_shuffle8!(a, _mm_setzero_ps(), [0_u32; 8])
 }

 /// Broadcasts the low packed 16-bit integer from a to all elements of
@@ -620,7 +621,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
+    let ret = simd_shuffle8!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
    transmute::<i16x8, _>(ret)
 }

@@ -634,7 +635,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
+    let ret = simd_shuffle16!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
    transmute::<i16x16, _>(ret)
 }

@@ -746,7 +747,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    let a = a.as_i16x8();
-    let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let v64: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
 }

@@ -781,7 +782,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
-    let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
 }

@@ -794,7 +795,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
-    let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let v32: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
 }

@@ -820,7 +821,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    let a = a.as_u16x8();
-    let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let v64: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
 }

@@ -856,7 +857,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
-    let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
 }

@@ -870,7 +871,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
-    let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    let v32: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
 }

@@ -889,7 +890,7 @@ pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_imm1!(IMM1);
    let a = a.as_i64x4();
    let b = _mm256_undefined_si256().as_i64x4();
-    let dst: i64x2 = simd_shuffle2(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
+    let dst: i64x2 = simd_shuffle2!(a, b, <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize]);
    transmute(dst)
 }

@@ -1711,7 +1712,8 @@ pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -
    static_assert_imm1!(IMM1);
    let a = a.as_i64x4();
    let b = _mm256_castsi128_si256(b).as_i64x4();
-    let dst: i64x4 = simd_shuffle4(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
+    let dst: i64x4 =
+        simd_shuffle4!(a, b, <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
    transmute(dst)
 }

@@ -2200,10 +2202,10 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
 pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let zero = _mm256_setzero_si256().as_i64x4();
-    let r: i64x4 = simd_shuffle4(
+    let r: i64x4 = simd_shuffle4!(
        a.as_i64x4(),
        zero,
-        [
+        <const IMM8: i32> [
            IMM8 as u32 & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -2237,10 +2239,10 @@ pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i)
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_imm8!(IMM8);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        _mm256_undefined_pd(),
-        [
+        <const IMM8: i32> [
            IMM8 as u32 & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -2350,10 +2352,10 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(MASK);
-    let r: i32x8 = simd_shuffle8(
+    let r: i32x8 = simd_shuffle8!(
        a.as_i32x8(),
        a.as_i32x8(),
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            (MASK as u32 >> 4) & 0b11,
@@ -2380,10 +2382,10 @@ pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
 pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x16();
-    let r: i16x16 = simd_shuffle16(
+    let r: i16x16 = simd_shuffle16!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            0,
            1,
            2,
@@ -2418,10 +2420,10 @@ pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
 pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x16();
-    let r: i16x16 = simd_shuffle16(
+    let r: i16x16 = simd_shuffle16!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            0 + (IMM8 as u32 & 0b11),
            0 + ((IMM8 as u32 >> 2) & 0b11),
            0 + ((IMM8 as u32 >> 4) & 0b11),
@@ -2585,10 +2587,10 @@ pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let a = a.as_i8x32();
    let zero = _mm256_setzero_si256().as_i8x32();
-    let r: i8x32 = simd_shuffle32(
+    let r: i8x32 = simd_shuffle32!(
        zero,
        a,
-        [
+        <const IMM8: i32> [
            32 - (IMM8 as u32 & 0xff),
            33 - (IMM8 as u32 & 0xff),
            34 - (IMM8 as u32 & 0xff),
@@ -2780,7 +2782,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    let a = a.as_i8x32();
    let zero = _mm256_setzero_si256().as_i8x32();
    let r: i8x32 = match IMM8 % 16 {
-        0 => simd_shuffle32(
+        0 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2788,7 +2790,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                23, 24, 25, 26, 27, 28, 29, 30, 31,
            ],
        ),
-        1 => simd_shuffle32(
+        1 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2796,7 +2798,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                24, 25, 26, 27, 28, 29, 30, 31, 32,
            ],
        ),
-        2 => simd_shuffle32(
+        2 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2804,7 +2806,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                25, 26, 27, 28, 29, 30, 31, 32, 32,
            ],
        ),
-        3 => simd_shuffle32(
+        3 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2812,7 +2814,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
            ],
        ),
-        4 => simd_shuffle32(
+        4 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2820,7 +2822,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
            ],
        ),
-        5 => simd_shuffle32(
+        5 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2828,7 +2830,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
            ],
        ),
-        6 => simd_shuffle32(
+        6 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2836,7 +2838,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        7 => simd_shuffle32(
+        7 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2844,7 +2846,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        8 => simd_shuffle32(
+        8 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2852,7 +2854,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        9 => simd_shuffle32(
+        9 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2860,7 +2862,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        10 => simd_shuffle32(
+        10 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2868,7 +2870,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        11 => simd_shuffle32(
+        11 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2876,7 +2878,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        12 => simd_shuffle32(
+        12 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2884,7 +2886,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        13 => simd_shuffle32(
+        13 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2892,7 +2894,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        14 => simd_shuffle32(
+        14 => simd_shuffle32!(
            a,
            zero,
            [
@@ -2900,7 +2902,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
            ],
        ),
-        15 => simd_shuffle32(
+        15 => simd_shuffle32!(
            a,
            zero,
            [
@@ -3178,7 +3180,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
    #[rustfmt::skip]
-    let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
+    let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
            8, 40, 9, 41, 10, 42, 11, 43,
            12, 44, 13, 45, 14, 46, 15, 47,
            24, 56, 25, 57, 26, 58, 27, 59,
@@ -3231,7 +3233,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
    #[rustfmt::skip]
-    let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
+    let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
        0, 32, 1, 33, 2, 34, 3, 35,
        4, 36, 5, 37, 6, 38, 7, 39,
        16, 48, 17, 49, 18, 50, 19, 51,
@@ -3279,7 +3281,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpunpckhwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
-    let r: i16x16 = simd_shuffle16(
+    let r: i16x16 = simd_shuffle16!(
        a.as_i16x16(),
        b.as_i16x16(),
        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
@@ -3327,7 +3329,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpunpcklwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
-    let r: i16x16 = simd_shuffle16(
+    let r: i16x16 = simd_shuffle16!(
        a.as_i16x16(),
        b.as_i16x16(),
        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
@@ -3368,7 +3370,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpckhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
+    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
    transmute(r)
 }

@@ -3405,7 +3407,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
+    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
    transmute(r)
 }

@@ -3442,7 +3444,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpckhpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
-    let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
+    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
    transmute(r)
 }

@@ -3479,7 +3481,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpcklpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
-    let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
+    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
    transmute(r)
 }

@@ -6218,7 +6218,7 @@ pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i {
    let a = _mm512_castsi128_si512(a).as_i16x32();
-    let ret: i16x32 = simd_shuffle32(
+    let ret: i16x32 = simd_shuffle32!(
        a,
        a,
        [
@@ -6306,7 +6306,7 @@ pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i {
    let a = _mm512_castsi128_si512(a).as_i8x64();
-    let ret: i8x64 = simd_shuffle64(
+    let ret: i8x64 = simd_shuffle64!(
        a,
        a,
        [
@@ -6397,7 +6397,7 @@ pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i16x32();
    let b = b.as_i16x32();
    #[rustfmt::skip]
-    let r: i16x32 = simd_shuffle32(
+    let r: i16x32 = simd_shuffle32!(
        a,
        b,
        [
@@ -6508,7 +6508,7 @@ pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i8x64();
    let b = b.as_i8x64();
    #[rustfmt::skip]
-    let r: i8x64 = simd_shuffle64(
+    let r: i8x64 = simd_shuffle64!(
        a,
        b,
        [
@@ -6627,7 +6627,7 @@ pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i16x32();
    let b = b.as_i16x32();
    #[rustfmt::skip]
-    let r: i16x32 = simd_shuffle32(
+    let r: i16x32 = simd_shuffle32!(
        a,
        b,
        [
@@ -6738,7 +6738,7 @@ pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i8x64();
    let b = b.as_i8x64();
    #[rustfmt::skip]
-    let r: i8x64 = simd_shuffle64(
+    let r: i8x64 = simd_shuffle64!(
        a,
        b,
        [
@@ -7133,10 +7133,10 @@ pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
 pub unsafe fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x32();
-    let r: i16x32 = simd_shuffle32(
+    let r: i16x32 = simd_shuffle32!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            IMM8 as u32 & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -7277,10 +7277,10 @@ pub unsafe fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i
 pub unsafe fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x32();
-    let r: i16x32 = simd_shuffle32(
+    let r: i16x32 = simd_shuffle32!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            0,
            1,
            2,
@@ -8433,7 +8433,7 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
 pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let zero = _mm_setzero_si128().as_i16x8();
-    let v256: i16x16 = simd_shuffle16(a, zero, [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]);
+    let v256: i16x16 = simd_shuffle16!(a, zero, [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]);
    transmute::<i8x16, _>(simd_cast(v256))
 }

@@ -8875,10 +8875,10 @@ pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let a = a.as_i8x64();
    let zero = _mm512_setzero_si512().as_i8x64();
-    let r: i8x64 = simd_shuffle64(
+    let r: i8x64 = simd_shuffle64!(
        zero,
        a,
-        [
+        <const IMM8: i32> [
            64 - (IMM8 as u32 & 0xff),
            65 - (IMM8 as u32 & 0xff),
            66 - (IMM8 as u32 & 0xff),
@@ -8960,7 +8960,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
    let a = a.as_i8x64();
    let zero = _mm512_setzero_si512().as_i8x64();
    let r: i8x64 = match IMM8 % 16 {
-        0 => simd_shuffle64(
+        0 => simd_shuffle64!(
            a,
            zero,
            [
@@ -8969,7 +8969,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
            ],
        ),
-        1 => simd_shuffle64(
+        1 => simd_shuffle64!(
            a,
            zero,
            [
@@ -8978,7 +8978,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
            ],
        ),
-        2 => simd_shuffle64(
+        2 => simd_shuffle64!(
            a,
            zero,
            [
@@ -8987,7 +8987,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
            ],
        ),
-        3 => simd_shuffle64(
+        3 => simd_shuffle64!(
            a,
            zero,
            [
@@ -8997,7 +8997,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                114,
            ],
        ),
-        4 => simd_shuffle64(
+        4 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9007,7 +9007,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                115,
            ],
        ),
-        5 => simd_shuffle64(
+        5 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9017,7 +9017,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                115, 116,
            ],
        ),
-        6 => simd_shuffle64(
+        6 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9027,7 +9027,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                116, 117,
            ],
        ),
-        7 => simd_shuffle64(
+        7 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9037,7 +9037,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                116, 117, 118,
            ],
        ),
-        8 => simd_shuffle64(
+        8 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9047,7 +9047,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                116, 117, 118, 119,
            ],
        ),
-        9 => simd_shuffle64(
+        9 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9057,7 +9057,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                117, 118, 119, 120,
            ],
        ),
-        10 => simd_shuffle64(
+        10 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9067,7 +9067,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                118, 119, 120, 121,
            ],
        ),
-        11 => simd_shuffle64(
+        11 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9077,7 +9077,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                117, 118, 119, 120, 121, 122,
            ],
        ),
-        12 => simd_shuffle64(
+        12 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9087,7 +9087,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                118, 119, 120, 121, 122, 123,
            ],
        ),
-        13 => simd_shuffle64(
+        13 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9097,7 +9097,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                119, 120, 121, 122, 123, 124,
            ],
        ),
-        14 => simd_shuffle64(
+        14 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9107,7 +9107,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
                120, 121, 122, 123, 124, 125,
            ],
        ),
-        15 => simd_shuffle64(
+        15 => simd_shuffle64!(
            a,
            zero,
            [
@@ -9146,7 +9146,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
    let b = b.as_i8x64();

    let r: i8x64 = match IMM8 % 16 {
-        0 => simd_shuffle64(
+        0 => simd_shuffle64!(
            b,
            a,
            [
@@ -9155,7 +9155,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
            ],
        ),
-        1 => simd_shuffle64(
+        1 => simd_shuffle64!(
            b,
            a,
            [
@@ -9164,7 +9164,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
            ],
        ),
-        2 => simd_shuffle64(
+        2 => simd_shuffle64!(
            b,
            a,
            [
@@ -9173,7 +9173,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
            ],
        ),
-        3 => simd_shuffle64(
+        3 => simd_shuffle64!(
            b,
            a,
            [
@@ -9183,7 +9183,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                114,
            ],
        ),
-        4 => simd_shuffle64(
+        4 => simd_shuffle64!(
            b,
            a,
            [
@@ -9193,7 +9193,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                115,
            ],
        ),
-        5 => simd_shuffle64(
+        5 => simd_shuffle64!(
            b,
            a,
            [
@@ -9203,7 +9203,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                115, 116,
            ],
        ),
-        6 => simd_shuffle64(
+        6 => simd_shuffle64!(
            b,
            a,
            [
@@ -9213,7 +9213,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                116, 117,
            ],
        ),
-        7 => simd_shuffle64(
+        7 => simd_shuffle64!(
            b,
            a,
            [
@@ -9223,7 +9223,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                116, 117, 118,
            ],
        ),
-        8 => simd_shuffle64(
+        8 => simd_shuffle64!(
            b,
            a,
            [
@@ -9233,7 +9233,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                116, 117, 118, 119,
            ],
        ),
-        9 => simd_shuffle64(
+        9 => simd_shuffle64!(
            b,
            a,
            [
@@ -9243,7 +9243,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                117, 118, 119, 120,
            ],
        ),
-        10 => simd_shuffle64(
+        10 => simd_shuffle64!(
            b,
            a,
            [
@@ -9253,7 +9253,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                118, 119, 120, 121,
            ],
        ),
-        11 => simd_shuffle64(
+        11 => simd_shuffle64!(
            b,
            a,
            [
@@ -9263,7 +9263,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                117, 118, 119, 120, 121, 122,
            ],
        ),
-        12 => simd_shuffle64(
+        12 => simd_shuffle64!(
            b,
            a,
            [
@@ -9273,7 +9273,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                118, 119, 120, 121, 122, 123,
            ],
        ),
-        13 => simd_shuffle64(
+        13 => simd_shuffle64!(
            b,
            a,
            [
@@ -9283,7 +9283,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                119, 120, 121, 122, 123, 124,
            ],
        ),
-        14 => simd_shuffle64(
+        14 => simd_shuffle64!(
            b,
            a,
            [
@@ -9293,7 +9293,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                120, 121, 122, 123, 124, 125,
            ],
        ),
-        15 => simd_shuffle64(
+        15 => simd_shuffle64!(
            b,
            a,
            [
@@ -10529,7 +10529,7 @@ pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
        0b11111111,
        _MM_FROUND_CUR_DIRECTION,
    );
-    simd_shuffle16(
+    simd_shuffle16!(
        r,
        _mm256_setzero_ps().as_f32x8(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@@ -10549,7 +10549,7 @@ pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> _
        k,
        _MM_FROUND_CUR_DIRECTION,
    );
-    simd_shuffle16(
+    simd_shuffle16!(
        r,
        _mm256_setzero_ps().as_f32x8(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@@ -10644,7 +10644,7 @@ pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpmovsxbq))]
 pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
    let a = a.as_i8x16();
-    let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i64x8, _>(simd_cast(v64))
 }

@@ -10805,7 +10805,7 @@ pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpmovzxbq))]
 pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
    let a = a.as_u8x16();
-    let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i64x8, _>(simd_cast(v64))
 }

@@ -11628,7 +11628,7 @@ pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
 pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
    let a = a.as_u32x4();
-    let u64: u32x2 = simd_shuffle2(a, a, [0, 1]);
+    let u64: u32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute::<f64x2, _>(simd_cast(u64))
 }

@@ -11663,7 +11663,7 @@ pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
 pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
    let v2 = v2.as_i32x16();
-    let v256: i32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v256: i32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<f64x8, _>(simd_cast(v256))
 }

@@ -11686,7 +11686,7 @@ pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i)
 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
 pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
    let v2 = v2.as_u32x16();
-    let v256: u32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let v256: u32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<f64x8, _>(simd_cast(v256))
 }

@@ -19215,10 +19215,10 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
    static_assert_imm8!(MASK);
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
@@ -19333,10 +19333,10 @@ pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> _
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1),
            ((MASK as u32 >> 2) & 0b1) + 2,
@@ -19451,10 +19451,10 @@ pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) ->
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
@@ -19507,10 +19507,10 @@ pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m51
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(MASK);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
@@ -19559,10 +19559,10 @@ pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m25
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
@@ -19613,10 +19613,10 @@ pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d)
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
    static_assert_imm8!(MASK);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        a,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
@@ -20867,10 +20867,10 @@ pub unsafe fn _mm_mask2_permutex2var_pd(
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
-    let r: i32x16 = simd_shuffle16(
+    let r: i32x16 = simd_shuffle16!(
        a.as_i32x16(),
        a.as_i32x16(),
-        [
+        <const MASK: _MM_PERM_ENUM> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            (MASK as u32 >> 4) & 0b11,
@@ -21003,10 +21003,10 @@ pub unsafe fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_imm8!(MASK);
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        b,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 16,
@@ -21140,10 +21140,10 @@ pub unsafe fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: _
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
-    simd_shuffle8(
+    simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 8,
            ((MASK as u32 >> 2) & 0b1) + 2,
@@ -21275,10 +21275,10 @@ pub unsafe fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> _
    static_assert_imm8!(MASK);
    let a = a.as_i32x16();
    let b = b.as_i32x16();
-    let r: i32x16 = simd_shuffle16(
+    let r: i32x16 = simd_shuffle16!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b11) * 4 + 0,
            (MASK as u32 & 0b11) * 4 + 1,
            (MASK as u32 & 0b11) * 4 + 2,
@@ -21347,10 +21347,10 @@ pub unsafe fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> _
    static_assert_imm8!(MASK);
    let a = a.as_i32x8();
    let b = b.as_i32x8();
-    let r: i32x8 = simd_shuffle8(
+    let r: i32x8 = simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b1) * 4 + 0,
            (MASK as u32 & 0b1) * 4 + 1,
            (MASK as u32 & 0b1) * 4 + 2,
@@ -21411,10 +21411,10 @@ pub unsafe fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> _
    static_assert_imm8!(MASK);
    let a = a.as_i64x8();
    let b = b.as_i64x8();
-    let r: i64x8 = simd_shuffle8(
+    let r: i64x8 = simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b11) * 2 + 0,
            (MASK as u32 & 0b11) * 2 + 1,
            ((MASK as u32 >> 2) & 0b11) * 2 + 0,
@@ -21475,10 +21475,10 @@ pub unsafe fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> _
    static_assert_imm8!(MASK);
    let a = a.as_i64x4();
    let b = b.as_i64x4();
-    let r: i64x4 = simd_shuffle4(
+    let r: i64x4 = simd_shuffle4!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b1) * 2 + 0,
            (MASK as u32 & 0b1) * 2 + 1,
            ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@@ -21535,10 +21535,10 @@ pub unsafe fn _mm512_shuffle_f32x4<const MASK: i32>(a: __m512, b: __m512) -> __m
    static_assert_imm8!(MASK);
    let a = a.as_f32x16();
    let b = b.as_f32x16();
-    let r: f32x16 = simd_shuffle16(
+    let r: f32x16 = simd_shuffle16!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b11) * 4 + 0,
            (MASK as u32 & 0b11) * 4 + 1,
            (MASK as u32 & 0b11) * 4 + 2,
@@ -21607,10 +21607,10 @@ pub unsafe fn _mm256_shuffle_f32x4<const MASK: i32>(a: __m256, b: __m256) -> __m
    static_assert_imm8!(MASK);
    let a = a.as_f32x8();
    let b = b.as_f32x8();
-    let r: f32x8 = simd_shuffle8(
+    let r: f32x8 = simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b1) * 4 + 0,
            (MASK as u32 & 0b1) * 4 + 1,
            (MASK as u32 & 0b1) * 4 + 2,
@@ -21671,10 +21671,10 @@ pub unsafe fn _mm512_shuffle_f64x2<const MASK: i32>(a: __m512d, b: __m512d) -> _
    static_assert_imm8!(MASK);
    let a = a.as_f64x8();
    let b = b.as_f64x8();
-    let r: f64x8 = simd_shuffle8(
+    let r: f64x8 = simd_shuffle8!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b11) * 2 + 0,
            (MASK as u32 & 0b11) * 2 + 1,
            ((MASK as u32 >> 2) & 0b11) * 2 + 0,
@@ -21735,10 +21735,10 @@ pub unsafe fn _mm256_shuffle_f64x2<const MASK: i32>(a: __m256d, b: __m256d) -> _
    static_assert_imm8!(MASK);
    let a = a.as_f64x4();
    let b = b.as_f64x4();
-    let r: f64x4 = simd_shuffle4(
+    let r: f64x4 = simd_shuffle4!(
        a,
        b,
-        [
+        <const MASK: i32> [
            (MASK as u32 & 0b1) * 2 + 0,
            (MASK as u32 & 0b1) * 2 + 1,
            ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@@ -21797,10 +21797,10 @@ pub unsafe fn _mm256_maskz_shuffle_f64x2<const MASK: i32>(
 pub unsafe fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
    static_assert_imm2!(IMM8);
    match IMM8 & 0x3 {
-        0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
+        0 => simd_shuffle4!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
+        2 => simd_shuffle4!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
+        _ => simd_shuffle4!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
    }
 }

@@ -21854,8 +21854,8 @@ pub unsafe fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m5
 pub unsafe fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
    static_assert_imm1!(IMM8);
    match IMM8 & 0x1 {
-        0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
+        0 => simd_shuffle4!(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
+        _ => simd_shuffle4!(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
    }
 }

@@ -21909,8 +21909,8 @@ pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m2
 pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
    static_assert_imm1!(IMM1);
    match IMM1 {
-        0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
+        0 => simd_shuffle4!(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
+        _ => simd_shuffle4!(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
    }
 }

@@ -21964,8 +21964,8 @@ pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: _
 pub unsafe fn _mm512_extractf64x4_pd<const IMM8: i32>(a: __m512d) -> __m256d {
    static_assert_imm1!(IMM8);
    match IMM8 & 0x1 {
-        0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
+        0 => simd_shuffle4!(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
+        _ => simd_shuffle4!(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
    }
 }

@@ -22021,10 +22021,10 @@ pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i
    let a = a.as_i32x16();
    let undefined = _mm512_undefined_epi32().as_i32x16();
    let extract: i32x4 = match IMM2 {
-        0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
+        0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
+        1 => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
+        2 => simd_shuffle4!(a, undefined, [8, 9, 10, 11]),
+        _ => simd_shuffle4!(a, undefined, [12, 13, 14, 15]),
    };
    transmute(extract)
 }
@@ -22081,8 +22081,8 @@ pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i
    let a = a.as_i32x8();
    let undefined = _mm256_undefined_si256().as_i32x8();
    let extract: i32x4 = match IMM1 {
-        0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
+        0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
+        _ => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
    };
    transmute(extract)
 }
@@ -22131,7 +22131,7 @@ pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovsldup))]
 pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
-    let r: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+    let r: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    transmute(r)
 }

@@ -22142,7 +22142,7 @@ pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovsldup))]
 pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
-    let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+    let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
 }

@@ -22153,7 +22153,7 @@ pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovsldup))]
 pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
-    let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
+    let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, mov, zero))
 }
@@ -22211,7 +22211,7 @@ pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovshdup))]
 pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
-    let r: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+    let r: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    transmute(r)
 }

@@ -22222,7 +22222,7 @@ pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovshdup))]
 pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
-    let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+    let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
 }

@@ -22233,7 +22233,7 @@ pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovshdup))]
 pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
-    let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
+    let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, mov, zero))
 }
@@ -22291,7 +22291,7 @@ pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovddup))]
 pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
-    let r: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+    let r: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    transmute(r)
 }

@@ -22302,7 +22302,7 @@ pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovddup))]
 pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
-    let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+    let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
 }

@@ -22313,7 +22313,7 @@ pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vmovddup))]
 pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
-    let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
+    let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, mov, zero))
 }
@@ -22376,22 +22376,22 @@ pub unsafe fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m
    let a = a.as_i32x16();
    let b = _mm512_castsi128_si512(b).as_i32x16();
    let ret: i32x16 = match IMM8 & 0b11 {
-        0 => simd_shuffle16(
+        0 => simd_shuffle16!(
            a,
            b,
            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
-        1 => simd_shuffle16(
+        1 => simd_shuffle16!(
            a,
            b,
            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
-        2 => simd_shuffle16(
+        2 => simd_shuffle16!(
            a,
            b,
            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
+        _ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
    };
    transmute(ret)
 }
@@ -22447,8 +22447,8 @@ pub unsafe fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m
    let a = a.as_i32x8();
    let b = _mm256_castsi128_si256(b).as_i32x8();
    let ret: i32x8 = match IMM8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
    };
    transmute(ret)
 }
@@ -22506,8 +22506,8 @@ pub unsafe fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m
    static_assert_imm1!(IMM8);
    let b = _mm512_castsi256_si512(b);
    match IMM8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
    }
 }

@@ -22558,22 +22558,22 @@ pub unsafe fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m51
    static_assert_imm2!(IMM8);
    let b = _mm512_castps128_ps512(b);
    match IMM8 & 0b11 {
-        0 => simd_shuffle16(
+        0 => simd_shuffle16!(
            a,
            b,
            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
-        1 => simd_shuffle16(
+        1 => simd_shuffle16!(
            a,
            b,
            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
-        2 => simd_shuffle16(
+        2 => simd_shuffle16!(
            a,
            b,
            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
+        _ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
    }
 }

@@ -22627,8 +22627,8 @@ pub unsafe fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m25
    static_assert_imm1!(IMM8);
    let b = _mm256_castps128_ps256(b);
    match IMM8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
    }
 }

@@ -22685,8 +22685,8 @@ pub unsafe fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m
    static_assert_imm1!(IMM8);
    let b = _mm512_castpd256_pd512(b);
    match IMM8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
    }
 }

@@ -22736,7 +22736,7 @@ pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i32x16();
    let b = b.as_i32x16();
    #[rustfmt::skip]
-    let r: i32x16 = simd_shuffle16(
+    let r: i32x16 = simd_shuffle16!(
        a, b,
        [ 2, 18, 3, 19,
          2 + 4, 18 + 4, 3 + 4, 19 + 4,
@@ -22837,7 +22837,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
 pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
-    simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
+    simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
 }

 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -22932,7 +22932,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
 #[cfg_attr(test, assert_instr(vunpckhps))]
 pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
    #[rustfmt::skip]
-    simd_shuffle16(
+    simd_shuffle16!(
        a, b,
        [ 2, 18, 3, 19,
          2 + 4, 18 + 4, 3 + 4, 19 + 4,
@@ -23017,7 +23017,7 @@ pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vunpckhpd))]
 pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
-    simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
+    simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
 }

 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23109,7 +23109,7 @@ pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
    let a = a.as_i32x16();
    let b = b.as_i32x16();
    #[rustfmt::skip]
-    let r: i32x16 = simd_shuffle16(
+    let r: i32x16 = simd_shuffle16!(
        a, b,
        [ 0, 16, 1, 17,
          0 + 4, 16 + 4, 1 + 4, 17 + 4,
@@ -23210,7 +23210,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
 pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
-    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+    simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
 }

 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23305,7 +23305,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
 #[cfg_attr(test, assert_instr(vunpcklps))]
 pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
    #[rustfmt::skip]
-    simd_shuffle16(a, b,
+    simd_shuffle16!(a, b,
                   [ 0, 16, 1, 17,
                     0 + 4, 16 + 4, 1 + 4, 17 + 4,
                     0 + 8, 16 + 8, 1 + 8, 17 + 8,
@@ -23389,7 +23389,7 @@ pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vunpcklpd))]
 pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
-    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+    simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
 }

 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23477,7 +23477,7 @@ pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m1
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        _mm_set1_ps(-1.),
        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@@ -23490,7 +23490,7 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        _mm256_set1_ps(-1.),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@@ -23503,7 +23503,7 @@ pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        _mm_set1_ps(0.),
        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@@ -23516,7 +23516,7 @@ pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
-    simd_shuffle16(
+    simd_shuffle16!(
        a,
        _mm256_set1_ps(0.),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@@ -23529,7 +23529,7 @@ pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23538,7 +23538,7 @@ pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
 }

 /// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23565,7 +23565,7 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
-    simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
+    simd_shuffle8!(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
 }

 /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23574,7 +23574,7 @@ pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
-    simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
+    simd_shuffle8!(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
 }

 /// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23583,7 +23583,7 @@ pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
-    simd_shuffle8(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
+    simd_shuffle8!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
 }

 /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23592,7 +23592,7 @@ pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
-    simd_shuffle8(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
+    simd_shuffle8!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
 }

 /// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23601,7 +23601,7 @@ pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23610,7 +23610,7 @@ pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23637,7 +23637,7 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
-    simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
+    simd_shuffle8!(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
 }

 /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23646,7 +23646,7 @@ pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
-    simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
+    simd_shuffle8!(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
 }

 /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23655,7 +23655,7 @@ pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
-    simd_shuffle8(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
+    simd_shuffle8!(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
 }

 /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23664,7 +23664,7 @@ pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
-    simd_shuffle8(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
+    simd_shuffle8!(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
 }

 /// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23673,7 +23673,7 @@ pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
-    simd_shuffle2(a, a, [0, 1])
+    simd_shuffle2!(a, a, [0, 1])
 }

 /// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23682,7 +23682,7 @@ pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
-    simd_shuffle4(a, a, [0, 1, 2, 3])
+    simd_shuffle4!(a, a, [0, 1, 2, 3])
 }

 /// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -23722,7 +23722,7 @@ pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
 #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
 pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
    let a = _mm512_castsi128_si512(a).as_i32x16();
-    let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+    let ret: i32x16 = simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
    transmute(ret)
 }

@@ -23802,7 +23802,7 @@ pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq
 pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
-    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23881,7 +23881,7 @@ pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
-    simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23960,7 +23960,7 @@ pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
-    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+    simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
 }

 /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24016,7 +24016,7 @@ pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d {
 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
    let a = a.as_i32x4();
-    let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+    let ret: i32x16 = simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
    transmute(ret)
 }

@@ -24048,7 +24048,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i
 #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
 pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i {
    let a = a.as_i32x4();
-    let ret: i32x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
+    let ret: i32x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
    transmute(ret)
 }

@@ -24079,7 +24079,7 @@ pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i {
 #[inline]
 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
 }

 /// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24109,7 +24109,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
 #[inline]
 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
-    simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
+    simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
 }

 /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24139,7 +24139,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
 pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
 }

 /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24169,7 +24169,7 @@ pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
 #[inline]
 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
-    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
 }

 /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24326,66 +24326,62 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
    let b = b.as_i32x16();
    let imm8: i32 = IMM8 % 16;
    let r: i32x16 = match imm8 {
-        0 => simd_shuffle16(
+        0 => simd_shuffle16!(
            a,
            b,
-            [
-                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-            ],
+            [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,],
        ),
-        1 => simd_shuffle16(
+        1 => simd_shuffle16!(
            a,
            b,
-            [
-                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
-            ],
+            [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,],
        ),
-        2 => simd_shuffle16(
+        2 => simd_shuffle16!(
            a,
            b,
            [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
        ),
-        3 => simd_shuffle16(
+        3 => simd_shuffle16!(
            a,
            b,
            [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
        ),
-        4 => simd_shuffle16(
+        4 => simd_shuffle16!(
            a,
            b,
            [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
        ),
-        5 => simd_shuffle16(
+        5 => simd_shuffle16!(
            a,
            b,
            [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
        ),
-        6 => simd_shuffle16(
+        6 => simd_shuffle16!(
            a,
            b,
            [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
        ),
-        7 => simd_shuffle16(
+        7 => simd_shuffle16!(
            a,
            b,
            [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
        ),
-        8 => simd_shuffle16(
+        8 => simd_shuffle16!(
            a,
            b,
            [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
        ),
-        9 => simd_shuffle16(
+        9 => simd_shuffle16!(
            a,
            b,
            [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
        ),
-        10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+        10 => simd_shuffle16!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+        11 => simd_shuffle16!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+        12 => simd_shuffle16!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+        13 => simd_shuffle16!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
+        14 => simd_shuffle16!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
+        _ => simd_shuffle16!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
    };
    transmute(r)
 }
@@ -24439,22 +24435,22 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
    let b = b.as_i32x8();
    let imm8: i32 = IMM8 % 16;
    let r: i32x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-        8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+        2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+        3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+        4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+        5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+        6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+        7 => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        8 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        9 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        10 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        11 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        12 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        13 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        14 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        _ => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
    };
    transmute(r)
 }
@@ -24508,14 +24504,14 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
    let b = b.as_i32x4();
    let imm8: i32 = IMM8 % 8;
    let r: i32x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 0]),
-        6 => simd_shuffle4(a, b, [2, 3, 0, 1]),
-        _ => simd_shuffle4(a, b, [3, 0, 1, 2]),
+        0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
+        1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
+        2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
+        3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
+        4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        5 => simd_shuffle4!(a, b, [1, 2, 3, 0]),
+        6 => simd_shuffle4!(a, b, [2, 3, 0, 1]),
+        _ => simd_shuffle4!(a, b, [3, 0, 1, 2]),
    };
    transmute(r)
 }
@@ -24567,14 +24563,14 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
    static_assert_imm8!(IMM8);
    let imm8: i32 = IMM8 % 8;
    let r: i64x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+        2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+        3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+        4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+        5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+        6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+        _ => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
    };
    transmute(r)
 }
@@ -24626,14 +24622,14 @@ pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __
    static_assert_imm8!(IMM8);
    let imm8: i32 = IMM8 % 8;
    let r: i64x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
+        0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
+        1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
+        2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
+        3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
+        4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
+        5 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
+        6 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
+        _ => simd_shuffle4!(a, b, [3, 4, 5, 6]),
    };
    transmute(r)
 }
@@ -24685,10 +24681,10 @@ pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
    static_assert_imm8!(IMM8);
    let imm8: i32 = IMM8 % 4;
    let r: i64x2 = match imm8 {
-        0 => simd_shuffle2(a, b, [2, 3]),
-        1 => simd_shuffle2(a, b, [3, 0]),
-        2 => simd_shuffle2(a, b, [0, 1]),
-        _ => simd_shuffle2(a, b, [1, 2]),
+        0 => simd_shuffle2!(a, b, [2, 3]),
+        1 => simd_shuffle2!(a, b, [3, 0]),
+        2 => simd_shuffle2!(a, b, [0, 1]),
+        _ => simd_shuffle2!(a, b, [1, 2]),
    };
    transmute(r)
 }
@@ -350,7 +350,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(cmpltss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
+    simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
 }

 /// Compares the lowest `f32` of both inputs for greater than or equal. The
@@ -364,7 +364,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(cmpless))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
+    simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
 }

 /// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
@@ -420,7 +420,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(cmpnltss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
+    simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
 }

 /// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
@@ -434,7 +434,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(cmpnless))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
+    simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
 }

 /// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
@@ -1011,10 +1011,10 @@ pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(MASK);
-    simd_shuffle4(
+    simd_shuffle4!(
        a,
        b,
-        [
+        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
@@ -1032,7 +1032,7 @@ pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(unpckhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, b, [2, 6, 3, 7])
+    simd_shuffle4!(a, b, [2, 6, 3, 7])
 }

 /// Unpacks and interleave single-precision (32-bit) floating-point elements
@@ -1044,7 +1044,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(unpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, b, [0, 4, 1, 5])
+    simd_shuffle4!(a, b, [0, 4, 1, 5])
 }

 /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the
@@ -1057,7 +1057,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on Windows?
-    simd_shuffle4(a, b, [6, 7, 2, 3])
+    simd_shuffle4!(a, b, [6, 7, 2, 3])
 }

 /// Combine lower half of `a` and `b`. The lower half of `b` occupies the
@@ -1069,7 +1069,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, b, [0, 1, 4, 5])
+    simd_shuffle4!(a, b, [0, 1, 4, 5])
 }

 /// Returns a mask of the most significant bit of each element in `a`.
@@ -1201,7 +1201,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    let a = _mm_load_ps(p);
-    simd_shuffle4(a, a, [3, 2, 1, 0])
+    simd_shuffle4!(a, a, [3, 2, 1, 0])
 }

 /// Loads unaligned 64-bits of integer data from memory into new vector.
@@ -1253,7 +1253,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
-    let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
+    let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
 }

@@ -1329,7 +1329,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
-    let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
+    let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
 }

@@ -1347,7 +1347,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
 #[cfg_attr(test, assert_instr(movss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
-    simd_shuffle4(a, b, [4, 1, 2, 3])
+    simd_shuffle4!(a, b, [4, 1, 2, 3])
 }

 /// Performs a serializing operation on all store-to-memory instructions that
@@ -432,10 +432,10 @@ const fn mask(shift: i32, i: u32) -> u32 {
        }
    }
    let zero = _mm_set1_epi8(0).as_i8x16();
-    transmute(simd_shuffle16::<i8x16, i8x16>(
+    transmute::<i8x16, _>(simd_shuffle16!(
        zero,
        a.as_i8x16(),
-        [
+        <const IMM8: i32> [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
@@ -635,10 +635,10 @@ const fn mask(shift: i32, i: u32) -> u32 {
        }
    }
    let zero = _mm_set1_epi8(0).as_i8x16();
-    let x: i8x16 = simd_shuffle16(
+    let x: i8x16 = simd_shuffle16!(
        a.as_i8x16(),
        zero,
-        [
+        <const IMM8: i32> [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
@@ -895,7 +895,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    let a = a.as_i32x4();
-    simd_cast::<i32x2, __m128d>(simd_shuffle2(a, a, [0, 1]))
+    simd_cast::<i32x2, __m128d>(simd_shuffle2!(a, a, [0, 1]))
 }

 /// Returns `a` with its lower element replaced by `b` after converting it to
@@ -1303,7 +1303,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
-    let r: i64x2 = simd_shuffle2(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
+    let r: i64x2 = simd_shuffle2!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
    transmute(r)
 }

@@ -1391,10 +1391,10 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
 pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let a = a.as_i32x4();
-    let x: i32x4 = simd_shuffle4(
+    let x: i32x4 = simd_shuffle4!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            IMM8 as u32 & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -1419,10 +1419,10 @@ pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
 pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x8();
-    let x: i16x8 = simd_shuffle8(
+    let x: i16x8 = simd_shuffle8!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            0,
            1,
            2,
@@ -1451,10 +1451,10 @@ pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
 pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let a = a.as_i16x8();
-    let x: i16x8 = simd_shuffle8(
+    let x: i16x8 = simd_shuffle8!(
        a,
        a,
-        [
+        <const IMM8: i32> [
            IMM8 as u32 & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
@@ -1476,7 +1476,7 @@ pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(punpckhbw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i8x16, _>(simd_shuffle16(
+    transmute::<i8x16, _>(simd_shuffle16!(
        a.as_i8x16(),
        b.as_i8x16(),
        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
@@ -1491,7 +1491,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(punpckhwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
-    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
+    let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
    transmute::<i16x8, _>(x)
 }

@@ -1503,7 +1503,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(unpckhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
+    transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
 }

 /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
@@ -1514,7 +1514,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(unpckhpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3]))
+    transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [1, 3]))
 }

 /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
@@ -1525,7 +1525,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(punpcklbw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i8x16, _>(simd_shuffle16(
+    transmute::<i8x16, _>(simd_shuffle16!(
        a.as_i8x16(),
        b.as_i8x16(),
        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
@@ -1540,7 +1540,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(punpcklwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
-    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
+    let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
    transmute::<i16x8, _>(x)
 }

@@ -1552,7 +1552,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(unpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
+    transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
 }

 /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
@@ -1563,7 +1563,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
-    transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2]))
+    transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [0, 2]))
 }

 /// Returns a new vector with the low element of `a` replaced by the sum of the
@@ -2519,7 +2519,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
-    let b: __m128d = simd_shuffle2(a, a, [0, 0]);
+    let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
 }

@@ -2533,7 +2533,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
-    let b: __m128d = simd_shuffle2(a, a, [0, 0]);
+    let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
 }

@@ -2548,7 +2548,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
-    let b: __m128d = simd_shuffle2(a, a, [1, 0]);
+    let b: __m128d = simd_shuffle2!(a, a, [1, 0]);
    *(mem_addr as *mut __m128d) = b;
 }

@@ -2612,7 +2612,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
    let a = _mm_load_pd(mem_addr);
-    simd_shuffle2(a, a, [1, 0])
+    simd_shuffle2!(a, a, [1, 0])
 }

 /// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@@ -2653,7 +2653,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm8!(MASK);
-    simd_shuffle2(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
+    simd_shuffle2!(a, b, <const MASK: i32> [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
 }

 /// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
@@ -2777,7 +2777,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
 #[cfg_attr(test, assert_instr(unpckhpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
-    simd_shuffle2(a, b, [1, 3])
+    simd_shuffle2!(a, b, [1, 3])
 }

 /// The resulting `__m128d` element is composed by the high-order values of
@@ -2792,7 +2792,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
-    simd_shuffle2(a, b, [0, 2])
+    simd_shuffle2!(a, b, [0, 2])
 }

 #[allow(improper_ctypes)]
@@ -106,7 +106,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(movddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
-    simd_shuffle2(a, a, [0, 0])
+    simd_shuffle2!(a, a, [0, 0])
 }

 /// Loads a double-precision (64-bit) floating-point element from memory
@@ -130,7 +130,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
 #[cfg_attr(test, assert_instr(movshdup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
-    simd_shuffle4(a, a, [1, 1, 3, 3])
+    simd_shuffle4!(a, a, [1, 1, 3, 3])
 }

 /// Duplicate even-indexed single-precision (32-bit) floating-point elements
@@ -142,7 +142,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(movsldup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
-    simd_shuffle4(a, a, [0, 0, 2, 2])
+    simd_shuffle4!(a, a, [0, 0, 2, 2])
 }

 #[allow(improper_ctypes)]
@@ -379,7 +379,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
-    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
 }

@@ -392,7 +392,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
-    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
+    let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
 }

@@ -406,7 +406,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
-    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
+    let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -419,7 +419,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
-    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
+    let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
 }

@@ -432,7 +432,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
-    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
+    let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -445,7 +445,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
-    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
+    let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -458,7 +458,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
-    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
 }

@@ -471,7 +471,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
-    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
+    let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
 }

@@ -484,7 +484,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
-    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
+    let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -498,7 +498,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
-    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
+    let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
 }

@@ -512,7 +512,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
-    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
+    let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -526,7 +526,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
-    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
+    let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
 }

@@ -113,10 +113,10 @@ const fn mask(shift: u32, i: u32) -> u32 {
            shift + i
        }
    }
-    let r: i8x16 = simd_shuffle16(
+    let r: i8x16 = simd_shuffle16!(
        b.as_i8x16(),
        a.as_i8x16(),
-        [
+        <const IMM8: i32> [
            mask(IMM8 as u32, 0),
            mask(IMM8 as u32, 1),
            mask(IMM8 as u32, 2),
@@ -194,8 +194,8 @@ generate int32x2_t:int32x2_t:int64x2_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, {vabd_u8, c, d}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
@@ -207,8 +207,8 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
-multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, {vabd_u16, c, d}
 a = 1, 2, 3, 4, 8, 9, 11, 12
 b = 10, 10, 10, 10, 10, 10, 10, 10
@@ -220,8 +220,8 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
-multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, {vabd_u32, c, d}
 a = 1, 2, 3, 4
 b = 10, 10, 10, 10
@@ -233,8 +233,8 @@ generate uint32x4_t:uint32x4_t:uint64x2_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -247,8 +247,8 @@ generate int8x16_t:int8x16_t:int16x8_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
-multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4, 9, 10, 11, 12
@@ -261,8 +261,8 @@ generate int16x8_t:int16x8_t:int32x4_t
 /// Signed Absolute difference Long
 name = vabdl_high
 no-q
-multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
-multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
 multi_fn = simd_cast, e
 a = 1, 2, 3, 4
@@ -727,7 +727,7 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -744,7 +744,7 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0., 0.5, 0., 0.
 n = 0:1
@@ -759,8 +759,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
+multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -777,8 +777,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
+multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0., 0.5, 0., 0.
 n = 0:1
@@ -793,8 +793,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 0:1
@@ -811,8 +811,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1:0
@@ -827,8 +827,8 @@ lane-suffixes
 constn = LANE1:LANE2
 multi_fn = static_assert_imm-in0_exp_len-LANE1
 multi_fn = static_assert_imm-in_exp_len-LANE2
-multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
-multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
+multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
+multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
 a = 1., 2., 3., 4.
 b = 0.5, 0., 0., 0.
 n = 1:0
@@ -897,7 +897,7 @@ generate float32x2_t:float64x2_t
 /// Floating-point convert to higher precision long
 name = vcvt_high
 noq-double-suffixes
-multi_fn = simd_shuffle2, b:float32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, b
 a = -1.2, 1.2, 2.3, 3.4
 validate 2.3f32 as f64, 3.4f32 as f64
@@ -918,7 +918,7 @@ generate float64x2_t:float32x2_t
 /// Floating-point convert to lower precision narrow
 name = vcvt_high
 noq-double-suffixes
-multi_fn = simd_shuffle4, a, {simd_cast, b}, [0, 1, 2, 3]
+multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
 a = -1.2, 1.2
 b = -2.3, 3.4
 validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
@@ -939,7 +939,7 @@ generate float64x2_t:float32x2_t
 /// Floating-point convert to lower precision narrow, rounding to odd
 name = vcvtx_high
 noq-double-suffixes
-multi_fn = simd_shuffle4, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
+multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
 a = -1.0, 2.0
 b = -3.0, 4.0
 validate -1.0, 2.0, -3.0, 4.0
@@ -1162,7 +1162,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
 n = HFLEN
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@@ -1188,7 +1188,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
 n = HFLEN
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@@ -1202,7 +1202,7 @@ name = vdup
 lane-suffixes
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
 a = 1., 1., 1., 4.
 n = HFLEN
 validate 1., 1., 1., 1.
@@ -1303,7 +1303,7 @@ generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
 a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
 b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
 n = HFLEN
@@ -1317,7 +1317,7 @@ generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
 a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
 b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
 n = HFLEN
@@ -1333,7 +1333,7 @@ generate int64x2_t, uint64x2_t
 name = vext
 constn = N
 multi_fn = static_assert_imm-out_exp_len-N
-multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
+multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
 a = 0., 2., 2., 3.
 b = 3., 4., 5., 6.,
 n = HFLEN
@@ -1403,7 +1403,7 @@ name = vmla
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1422,7 +1422,7 @@ name = vmla
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 0., 1., 2., 3.
 b = 2., 2., 2., 2.
 c = 0., 3., 0., 0.
@@ -1477,7 +1477,7 @@ name = vmlal_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1495,8 +1495,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
 /// Signed multiply-add long
 name = vmlal_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlal-noqself-noext, a, b, c
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1510,8 +1510,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
 /// Unsigned multiply-add long
 name = vmlal_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlal-noqself-noext, a, b, c
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1541,7 +1541,7 @@ name = vmlal_high_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 8, 7, 6, 5, 4, 3, 2, 1
 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1613,7 +1613,7 @@ name = vmls
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1632,7 +1632,7 @@ name = vmls
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 6., 7., 8., 9.
 b = 2., 2., 2., 2.
 c = 0., 3., 0., 0.
@@ -1687,7 +1687,7 @@ name = vmlsl_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1705,8 +1705,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
 /// Signed multiply-subtract long
 name = vmlsl_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlsl-noqself-noext, a, b, c
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1720,8 +1720,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
 /// Unsigned multiply-subtract long
 name = vmlsl_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
 multi_fn = vmlsl-noqself-noext, a, b, c
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@@ -1751,7 +1751,7 @@ name = vmlsl_high_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
 a = 14, 15, 16, 17, 18, 19, 20, 21
 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1769,7 +1769,7 @@ generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint
 name = vmovn_high
 no-q
 multi_fn = simd_cast, c:in_t0, b
-multi_fn = simd_shuffle-out_len-noext, a, c, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 4, 5
 b = 2, 3, 4, 5, 12, 13, 14, 15
 validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
@@ -2070,7 +2070,7 @@ name = vmul
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
+multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -2102,7 +2102,7 @@ name = vmul
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
+multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
 a = 1., 2., 3., 4.
 b = 2., 0., 0., 0.
 n = 0
@@ -2155,8 +2155,8 @@ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:i
 /// Signed multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@@ -2181,8 +2181,8 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
 /// Unsigned multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@@ -2222,8 +2222,8 @@ link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
 /// Polynomial multiply long
 name = vmull_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
 multi_fn = vmull-noqself-noext, a, b
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
@@ -2263,7 +2263,7 @@ generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
 name = vmull_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -2294,7 +2294,7 @@ generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
 name = vmull_high_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
 a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1
@@ -2336,7 +2336,7 @@ name = vmulx
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
+multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
 a = 1., 2., 3., 4.
 b = 2., 0., 0., 0.
 n = 0
@@ -2573,7 +2573,7 @@ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
 name = vsubhn_high
 no-q
 multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
-multi_fn = simd_shuffle-out_len-noext, a, d, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
 a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
 b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
 c = 1, 0, 1, 0, 1, 0, 1, 0
@@ -2629,7 +2629,7 @@ generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle8, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 12, 13, 14, 15, 16
 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
@@ -2641,7 +2641,7 @@ generate int16x8_t:int8x16_t:int16x8_t
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle4, c:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11
 b = 0, 1, 2, 3, 8, 9, 10, 11
@@ -2653,7 +2653,7 @@ generate int32x4_t:int16x8_t:int32x4_t
 /// Signed Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle2, c:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9
 b = 6, 7, 8, 9
@@ -2665,7 +2665,7 @@ generate int64x2_t:int32x4_t:int64x2_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle8, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11, 12, 13, 14, 15
 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -2677,7 +2677,7 @@ generate uint16x8_t:uint8x16_t:uint16x8_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle4, c:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9, 10, 11
 b = 0, 1, 2, 3, 8, 9, 10, 11
@@ -2689,7 +2689,7 @@ generate uint32x4_t:uint16x8_t:uint32x4_t
 /// Unsigned Subtract Wide
 name = vsubw_high
 no-q
-multi_fn = simd_shuffle2, c:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_sub, a, {simd_cast, c}
 a = 8, 9
 b = 6, 7, 8, 9
@@ -2731,9 +2731,9 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle8, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -2747,9 +2747,9 @@ generate int8x16_t:int8x16_t:int16x8_t
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle4, e:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -2763,9 +2763,9 @@ generate int16x8_t:int16x8_t:int32x4_t
 /// Signed Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle2, e:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -2779,9 +2779,9 @@ generate int32x4_t:int32x4_t:int64x2_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle8, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -2795,9 +2795,9 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle4, e:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -2811,9 +2811,9 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Subtract Long
 name = vsubl_high
 no-q
-multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
+multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
 multi_fn = simd_cast, d:out_t, c
-multi_fn = simd_shuffle2, e:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
 multi_fn = simd_cast, f:out_t, e
 multi_fn = simd_sub, d, f

@@ -3011,8 +3011,8 @@ generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
 /// Signed saturating doubling multiply long
 name = vqdmull_high
 no-q
-multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-halflen-halflen}
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {asc-halflen-halflen}
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 1, 2, 5, 6, 5, 6, 7, 8
@@ -3024,7 +3024,7 @@ generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
 /// Signed saturating doubling multiply long
 name = vqdmull_high_n
 no-q
-multi_fn = simd_shuffle-out_len-noext, a:in_ntt, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
 multi_fn = vqdmull-in_ntt-noext, a, b
 a = 0, 2, 8, 10, 8, 10, 12, 14
@@ -3038,7 +3038,7 @@ generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
 name = vqdmull_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, b:in_t0, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 1, 2, 3, 4
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -3083,8 +3083,8 @@ generate i32:int32x2_t:i64, i32:int32x4_t:i64
 name = vqdmull_high_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, a:in_t, a, a, {asc-out_len-out_len}
-multi_fn = simd_shuffle-out_len-noext, b:in_t, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-self-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -3098,8 +3098,8 @@ generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
 name = vqdmull_high_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-out_len-out_len}
-multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {dup-out_len-N as u32}
+multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
+multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
 multi_fn = vqdmull-noqself-noext, a, b
 a = 0, 1, 4, 5, 4, 5, 6, 7
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -3390,7 +3390,7 @@ name = vqrdmulh
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_shuffle-out_len-noext, b:out_t, b, b, {dup-out_len-LANE as u32}
+multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
 multi_fn = vqrdmulh-out-noext, a, b
 a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 b = 0, 2, 0, 0, 0, 0, 0, 0,
@@ -3616,7 +3616,7 @@ name = vqrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -3662,7 +3662,7 @@ name = vqrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -3708,7 +3708,7 @@ name = vqrshrun_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 2, 3, 2, 3, 6, 7
 b = 8, 12, 24, 28, 48, 52, 56, 60
 n = 2
@@ -3858,7 +3858,7 @@ name = vqshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -3903,7 +3903,7 @@ name = vqshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -3948,7 +3948,7 @@ name = vqshrun_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -4312,7 +4312,7 @@ name = vrshrn_high
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 0, 1, 8, 9, 8, 9, 10, 11
 b = 32, 36, 40, 44, 48, 52, 56, 60
 n = 2
@@ -4542,7 +4542,7 @@ name = vshll_high_n
 no-q
 constn = N
 multi_fn = static_assert-N-0-bits
-multi_fn = simd_shuffle-out_len-noext, b:half, a, a, {asc-halflen-halflen}
+multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
 multi_fn = vshll_n-noqself-::<N>, b
 a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
 n = 2
@@ -4589,7 +4589,7 @@ name = vshrn_high_n
 no-q
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_shuffle-out_len-noext, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
+multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
 a = 1, 2, 5, 6, 5, 6, 7, 8
 b = 20, 24, 28, 32, 52, 56, 60, 64
 n = 2
@@ -4631,7 +4631,7 @@ generate uint*_t, uint64x*_t

 /// Transpose vectors
 name = vtrn1
-multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
@@ -4644,7 +4644,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

 /// Transpose vectors
 name = vtrn1
-multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 0., 1., 4., 5., 8., 9., 12., 13.
@@ -4657,7 +4657,7 @@ generate float32x2_t, float64x2_t

 /// Transpose vectors
 name = vtrn2
-multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
@@ -4670,7 +4670,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

 /// Transpose vectors
 name = vtrn2
-multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 2., 3., 6., 7., 10., 11., 14., 15.
@@ -4683,7 +4683,7 @@ generate float32x2_t, float64x2_t

 /// Zip vectors
 name = vzip1
-multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
 a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -4693,7 +4693,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4

 /// Zip vectors
 name = vzip1
-multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
 a = 0., 2., 4., 6., 8., 10., 12., 14.
 b = 1., 3., 5., 7., 9., 11., 13., 15.
 validate 0., 1., 2., 3., 4., 5., 6., 7.
@@ -4703,7 +4703,7 @@ generate float32x2_t, float32x4_t, float64x2_t

 /// Zip vectors
 name = vzip2
-multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
 a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
 b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
 validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -4713,7 +4713,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4

 /// Zip vectors
 name = vzip2
-multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
 a = 0., 8., 8., 10., 8., 10., 12., 14.
 b = 1., 9., 9., 11., 9., 11., 13., 15.
 validate 8., 9., 10., 11., 12., 13., 14., 15.
@@ -4723,7 +4723,7 @@ generate float32x2_t, float32x4_t, float64x2_t

 /// Unzip vectors
 name = vuzp1
-multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
 a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
 b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
 validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16
@@ -4736,7 +4736,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

 /// Unzip vectors
 name = vuzp1
-multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
 a = 0., 8., 1., 9., 4., 12., 5., 13.
 b = 1., 10., 3., 11., 6., 14., 7., 15.
 validate 0., 1., 1., 3., 4., 5., 6., 7.
@@ -4749,7 +4749,7 @@ generate float32x2_t, float64x2_t

 /// Unzip vectors
 name = vuzp2
-multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
 a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
 b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
 validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32
@@ -4762,7 +4762,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

 /// Unzip vectors
 name = vuzp2
-multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
+multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
 a = 0., 8., 1., 9., 4., 12., 5., 13.
 b = 2., 9., 3., 11., 6., 14., 7., 15.
 validate 8., 9., 9., 11., 12., 13., 14., 15.
@@ -4793,8 +4793,8 @@ generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = vabd_u8, d, e, f:uint8x8_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 9, 10, 11, 12, 13, 14, 15, 16
@@ -4808,8 +4808,8 @@ generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
-multi_fn = simd_shuffle4, e:uint16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
 multi_fn = vabd_u16, d, e, f:uint16x4_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 9, 10, 11, 12
@@ -4823,8 +4823,8 @@ generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
 /// Unsigned Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
-multi_fn = simd_shuffle2, e:uint32x2_t, c, c, [2, 3]
+multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
 multi_fn = vabd_u32, d, e, f:uint32x2_t
 multi_fn = simd_add, a, {simd_cast, f}
 a = 15, 16
@@ -4884,8 +4884,8 @@ generate int64x2_t:int32x2_t:int32x2_t:int64x2_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
-multi_fn = simd_shuffle8, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
+multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
 multi_fn = vabd_s8, d, e, f:int8x8_t
 multi_fn = simd_cast, f:uint8x8_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -4900,8 +4900,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
-multi_fn = simd_shuffle4, e:int16x4_t, c, c, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
+multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
 multi_fn = vabd_s16, d, e, f:int16x4_t
 multi_fn = simd_cast, f:uint16x4_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -4916,8 +4916,8 @@ generate int32x4_t:int16x8_t:int16x8_t:int32x4_t
 /// Signed Absolute difference and Accumulate Long
 name = vabal_high
 no-q
-multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
-multi_fn = simd_shuffle2, e:int32x2_t, c, c, [2, 3]
+multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
+multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
 multi_fn = vabd_s32, d, e, f:int32x2_t
 multi_fn = simd_cast, f:uint32x2_t, f
 multi_fn = simd_add, a, {simd_cast, f}
@@ -988,6 +988,17 @@ fn gen_aarch64(
            );
        }
    };
+    let const_declare = if let Some(constn) = constn {
+        if constn.contains(":") {
+            let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
+            assert_eq!(constns.len(), 2);
+            format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
+        } else {
+            format!(r#"<const {}: i32>"#, constn)
+        }
+    } else {
+        String::new()
+    };
    let multi_calls = if !multi_fn.is_empty() {
        let mut calls = String::new();
        for i in 0..multi_fn.len() {
@@ -997,6 +1008,7 @@ fn gen_aarch64(
            calls.push_str(&get_call(
                &multi_fn[i],
                current_name,
+                &const_declare,
                in_t,
                out_t,
                fixed,
@@ -1007,17 +1019,6 @@ fn gen_aarch64(
    } else {
        String::new()
    };
-    let const_declare = if let Some(constn) = constn {
-        if constn.contains(":") {
-            let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
-            assert_eq!(constns.len(), 2);
-            format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
-        } else {
-            format!(r#"<const {}: i32>"#, constn)
-        }
-    } else {
-        String::new()
-    };
    let const_assert = if let Some(constn) = constn {
        if constn.contains(":") {
            let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
@@ -1582,6 +1583,11 @@ fn gen_arm(
            ));
        }
    };
+    let const_declare = if let Some(constn) = constn {
+        format!(r#"<const {}: i32>"#, constn)
+    } else {
+        String::new()
+    };
    let multi_calls = if !multi_fn.is_empty() {
        let mut calls = String::new();
        for i in 0..multi_fn.len() {
@@ -1591,6 +1597,7 @@ fn gen_arm(
            calls.push_str(&get_call(
                &multi_fn[i],
                current_name,
+                &const_declare,
                in_t,
                out_t,
                fixed,
@@ -1601,11 +1608,6 @@ fn gen_arm(
    } else {
        String::new()
    };
-    let const_declare = if let Some(constn) = constn {
-        format!(r#"<const {}: i32>"#, constn)
-    } else {
-        String::new()
-    };
    let const_assert = if let Some(constn) = constn {
        format!(
            r#", {} = {}"#,
@@ -2003,6 +2005,7 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
 fn get_call(
    in_str: &str,
    current_name: &str,
+    const_declare: &str,
    in_t: &[&str; 3],
    out_t: &str,
    fixed: &Vec<String>,
@@ -2041,7 +2044,7 @@ fn get_call(
            "halflen" => type_len(in_t[1]) / 2,
            _ => 0,
        };
-        let mut s = String::from("[");
+        let mut s = format!("{} [", const_declare);
        for i in 0..len {
            if i != 0 {
                s.push_str(", ");
@@ -2084,7 +2087,7 @@ fn get_call(
            "in0_len" => type_len(in_t[0]),
            _ => 0,
        };
-        let mut s = String::from("[");
+        let mut s = format!("{} [", const_declare);
        for i in 0..len {
            if i != 0 {
                s.push_str(", ");
@@ -2167,7 +2170,15 @@ fn get_call(
            let sub_match = format!(
                "        {} => {},\n",
                i,
-                get_call(&sub_call, current_name, in_t, out_t, fixed, Some(i as i32))
+                get_call(
+                    &sub_call,
+                    current_name,
+                    const_declare,
+                    in_t,
+                    out_t,
+                    fixed,
+                    Some(i as i32)
+                )
            );
            call.push_str(&sub_match);
        }
@@ -2210,6 +2221,7 @@ fn get_call(
            let sub_call = get_call(
                &sub_fn[1..sub_fn.len() - 1],
                current_name,
+                const_declare,
                in_t,
                out_t,
                fixed,