update C headers to LLVM 16

upstream commit 0604154e006e88e9e7f82d8ee5fd076bda206613
Andrew Kelley
2023-01-26 13:15:35 -07:00
parent b260bf42d3
commit 85be0b8c65
59 changed files with 21633 additions and 19318 deletions
+2
@@ -666,6 +666,7 @@ __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
@@ -713,6 +714,7 @@ __tex_fetch(__DataT *, __RetT *__ptr,
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
+5
@@ -288,12 +288,17 @@ __llvm_amdgcn_rsq_f64(double __x) {
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+10 -13
@@ -70,9 +70,9 @@ __DEVICE__ void __static_assert_equal_size() {
#endif
__DEVICE__
uint64_t __make_mantissa_base8(const char *__tagp) {
uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
while (__tagp) {
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '7')
@@ -87,9 +87,9 @@ uint64_t __make_mantissa_base8(const char *__tagp) {
}
__DEVICE__
uint64_t __make_mantissa_base10(const char *__tagp) {
uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
while (__tagp) {
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
@@ -104,9 +104,9 @@ uint64_t __make_mantissa_base10(const char *__tagp) {
}
__DEVICE__
uint64_t __make_mantissa_base16(const char *__tagp) {
uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
while (__tagp) {
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
@@ -125,10 +125,7 @@ uint64_t __make_mantissa_base16(const char *__tagp) {
}
__DEVICE__
uint64_t __make_mantissa(const char *__tagp) {
if (!__tagp)
return 0u;
uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
if (*__tagp == '0') {
++__tagp;
@@ -233,7 +230,7 @@ __DEVICE__
float expm1f(float __x) { return __ocml_expm1_f32(__x); }
__DEVICE__
float fabsf(float __x) { return __ocml_fabs_f32(__x); }
float fabsf(float __x) { return __builtin_fabsf(__x); }
__DEVICE__
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
@@ -359,7 +356,7 @@ float modff(float __x, float *__iptr) {
}
__DEVICE__
float nanf(const char *__tagp) {
float nanf(const char *__tagp __attribute__((nonnull))) {
union {
float val;
struct ieee_float {
@@ -792,7 +789,7 @@ __DEVICE__
double expm1(double __x) { return __ocml_expm1_f64(__x); }
__DEVICE__
double fabs(double __x) { return __ocml_fabs_f64(__x); }
double fabs(double __x) { return __builtin_fabs(__x); }
__DEVICE__
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
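For illustration (not part of the commit): the loop-condition change in the __make_mantissa_* helpers matters because `while (__tagp)` tests the pointer itself, which is always true once the argument is declared nonnull, whereas `while (*__tagp != '\0')` walks the tag string up to its terminator. A host-side sketch of the same base-8 accumulation pattern in plain C; the names are illustrative and the return-0-on-bad-digit behavior is assumed from the surrounding header:

#include <stdint.h>
#include <stdio.h>

/* Accumulate octal digits from a NUL-terminated tag string; a non-octal
   character yields 0 (assumed to match the header's helper). */
static uint64_t make_mantissa_base8(const char *tagp) {
  uint64_t r = 0;
  while (*tagp != '\0') {            /* stop at the terminator, not the pointer */
    char c = *tagp;
    if (c >= '0' && c <= '7')
      r = (r * 8u) + (uint64_t)(c - '0');
    else
      return 0;
    ++tagp;
  }
  return r;
}

int main(void) {
  printf("%llu\n", (unsigned long long)make_mantissa_base8("17")); /* prints 15 */
  return 0;
}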
+1
@@ -113,6 +113,7 @@ __attribute__((weak)) inline __device__ void free(void *__ptr) {
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#include <__clang_hip_stdlib.h>
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
+43
@@ -0,0 +1,43 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_STDLIB_H__
#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__cplusplus)
#include <limits.h>
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif
__DEVICE__
int abs(int __x) {
int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
#endif // !defined(__cplusplus)
#endif // #define __CLANG_HIP_STDLIB_H__
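For illustration (not part of the commit): the new abs/labs/llabs definitions use the branchless two's-complement idiom. Shifting the value right by the sign-bit position yields 0 for non-negative inputs and all ones (-1) for negative ones on the usual arithmetic-shift implementations, so (x ^ sgn) - sgn leaves x unchanged or computes ~x + 1 == -x. A host-side check of the same idiom (assumes arithmetic right shift of negative values, as device compilers provide):

#include <limits.h>
#include <stdio.h>

/* Branchless |x|: sgn is 0 for x >= 0 and -1 for x < 0, so the XOR/subtract
   pair either leaves x alone or negates it. */
static int branchless_abs(int x) {
  int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
  return (x ^ sgn) - sgn;
}

int main(void) {
  printf("%d %d %d\n", branchless_abs(-7), branchless_abs(0), branchless_abs(42));
  /* prints: 7 0 42 */
  return 0;
}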
+14 -14
@@ -17323,32 +17323,32 @@ provided.
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
#ifdef __VSX__
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vsbox(vector unsigned long long __a) {
static __inline__ vector unsigned char __attribute__((__always_inline__))
__builtin_crypto_vsbox(vector unsigned char __a) {
return __builtin_altivec_crypto_vsbox(__a);
}
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vcipher(vector unsigned long long __a,
vector unsigned long long __b) {
static __inline__ vector unsigned char __attribute__((__always_inline__))
__builtin_crypto_vcipher(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_altivec_crypto_vcipher(__a, __b);
}
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vcipherlast(vector unsigned long long __a,
vector unsigned long long __b) {
static __inline__ vector unsigned char __attribute__((__always_inline__))
__builtin_crypto_vcipherlast(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_altivec_crypto_vcipherlast(__a, __b);
}
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vncipher(vector unsigned long long __a,
vector unsigned long long __b) {
static __inline__ vector unsigned char __attribute__((__always_inline__))
__builtin_crypto_vncipher(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_altivec_crypto_vncipher(__a, __b);
}
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vncipherlast(vector unsigned long long __a,
vector unsigned long long __b) {
static __inline__ vector unsigned char __attribute__((__always_inline__))
__builtin_crypto_vncipherlast(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
+58
@@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
/// FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
/// FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
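For illustration (not part of the commit): a scalar C model of the accumulation that the \code{.operation} block above describes, with the FP16 inputs assumed already widened to float. The function name, argument layout, and dimensions are illustrative only:

#include <stddef.h>

/* Update one dst row: every pair index k contributes two widened FP16
   products into each fp32 accumulator n, as in the pseudocode above. */
static void dpfp16ps_row_model(float *dst_row, size_t n_cols,
                               const float *a_row,  /* 2 * k_pairs values         */
                               const float *b_rows, /* k_pairs rows of 2 * n_cols  */
                               size_t k_pairs) {
  for (size_t k = 0; k < k_pairs; ++k)
    for (size_t n = 0; n < n_cols; ++n) {
      dst_row[n] += a_row[2 * k + 0] * b_rows[k * 2 * n_cols + 2 * n + 0];
      dst_row[n] += a_row[2 * k + 1] * b_rows[k * 2 * n_cols + 2 * n + 1];
    }
}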
+32
@@ -22,6 +22,8 @@
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct pack the shape and tile data together for user. We suggest
/// initializing the struct as early as possible, because compiler depends
/// on the shape information to do configure. The constant value is preferred
@@ -484,9 +493,32 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
src0.tile, src1.tile);
}
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
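For illustration (not part of the commit): a sketch of how the new C++-style wrapper might be used alongside the existing __tile_loadd helper from this header. It assumes the tile palette was configured earlier with _tile_loadconfig, the __tile1024i row/col fields were initialized by the caller, and the translation unit is built with AMX and AMX-FP16 enabled; the function name is illustrative:

#include <stddef.h>
#include <immintrin.h>

/* Sketch: load two source tiles and accumulate their FP16 dot-product
   into *c. Tile configuration and error handling are omitted. */
static void dp_fp16_tiles(__tile1024i *c, __tile1024i a, __tile1024i b,
                          const void *a_buf, const void *b_buf, size_t stride) {
  __tile_loadd(&a, a_buf, stride);
  __tile_loadd(&b, b_buf, stride);
  __tile_dpfp16ps(c, a, b);
}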
+76 -75
@@ -64,7 +64,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
}
#endif
#if __ARM_32BIT_STATE
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif
@@ -82,7 +82,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)
#if __ARM_32BIT_STATE
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, 1)
#else
@@ -93,7 +93,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)
#if __ARM_32BIT_STATE
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, 0)
#else
@@ -140,17 +140,17 @@ __rorl(unsigned long __x, uint32_t __y) {
/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
return __builtin_clz(__t);
return (uint32_t)__builtin_clz(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
return __builtin_clzl(__t);
return (unsigned long)__builtin_clzl(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
return __builtin_clzll(__t);
return (uint64_t)__builtin_clzll(__t);
}
/* CLS */
@@ -201,7 +201,7 @@ __rev16(uint32_t __t) {
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
@@ -216,7 +216,7 @@ __rev16l(unsigned long __t) {
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
return __builtin_bswap16(__t);
return (int16_t)__builtin_bswap16((uint16_t)__t);
}
/* RBIT */
@@ -227,7 +227,7 @@ __rbit(uint32_t __t) {
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if __ARM_32BIT_STATE
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
__builtin_arm_rbit(__t >> 32);
#else
@@ -247,7 +247,7 @@ __rbitl(unsigned long __t) {
/*
* 9.3 16-bit multiplications
*/
#if __ARM_FEATURE_DSP
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
return __builtin_arm_smulbb(__a, __b);
@@ -277,17 +277,17 @@ __smulwt(int32_t __a, int32_t __b) {
/*
* 9.4 Saturating intrinsics
*
* FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
* FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
* intrinsics are implemented and the flag is enabled.
*/
/* 9.4.1 Width-specified saturation intrinsics */
#if __ARM_FEATURE_SAT
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_FEATURE_DSP
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
@@ -305,7 +305,7 @@ __qdbl(int32_t __t) {
#endif
/* 9.4.3 Accumultating multiplications */
#if __ARM_FEATURE_DSP
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabb(__a, __b, __c);
@@ -334,13 +334,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {
/* 9.5.4 Parallel 16-bit saturation */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/* 9.5.5 Packing and unpacking */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
@@ -365,7 +365,7 @@ __uxtb16(int8x4_t __a) {
#endif
/* 9.5.6 Parallel selection */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_sel(__a, __b);
@@ -373,7 +373,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
#endif
/* 9.5.7 Parallel 8-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qadd8(__a, __b);
@@ -425,7 +425,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
#endif
/* 9.5.8 Sum of 8-bit absolute differences */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usad8(__a, __b);
@@ -437,7 +437,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
#endif
/* 9.5.9 Parallel 16-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qadd16(__a, __b);
@@ -537,7 +537,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
#endif
/* 9.5.10 Parallel 16-bit multiplications */
#if __ARM_FEATURE_SIMD32
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlad(__a, __b, __c);
@@ -589,155 +589,156 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
#endif
/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
(defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32b(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32h(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32w(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32d(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32cb(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32ch(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32cw(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32cd(__a, __b);
}
#endif
/* Armv8.3-A Javascript conversion intrinsic */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
return __builtin_arm_jcvt(__a);
}
#endif
/* Armv8.5-A FP rounding intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32zf(float __a) {
return __builtin_arm_frint32zf(__a);
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
return __builtin_arm_rint32zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32z(double __a) {
return __builtin_arm_frint32z(__a);
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
return __builtin_arm_rint32z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64zf(float __a) {
return __builtin_arm_frint64zf(__a);
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
return __builtin_arm_rint64zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64z(double __a) {
return __builtin_arm_frint64z(__a);
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
return __builtin_arm_rint64z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32xf(float __a) {
return __builtin_arm_frint32xf(__a);
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
return __builtin_arm_rint32xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32x(double __a) {
return __builtin_arm_frint32x(__a);
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
return __builtin_arm_rint32x(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64xf(float __a) {
return __builtin_arm_frint64xf(__a);
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
return __builtin_arm_rint64xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64x(double __a) {
return __builtin_arm_frint64x(__a);
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
return __builtin_arm_rint64x(__a);
}
#endif
/* Armv8.7-A load/store 64-byte intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
uint64_t val[8];
} data512_t;
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
data512_t __value;
__builtin_arm_ld64b(__addr, __value.val);
return __value;
data512_t __value;
__builtin_arm_ld64b(__addr, __value.val);
return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
__builtin_arm_st64b(__addr, __value.val);
__builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
return __builtin_arm_st64bv(__addr, __value.val);
return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
return __builtin_arm_st64bv0(__addr, __value.val);
return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
/* Memory Tagging Extensions (MTE) Intrinsics */
#if __ARM_FEATURE_MEMORY_TAGGING
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
#endif
/* Memory Operations Intrinsics */
#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
/* Transactional Memory Extension (TME) Intrinsics */
#if __ARM_FEATURE_TME
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
@@ -759,12 +760,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */
/* Armv8.5-A Random number generation intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
static __inline__ int __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
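For illustration (not part of the commit): the repeated guard rewrite above, from `#if __ARM_FEATURE_X` to `#if defined(__ARM_FEATURE_X) && __ARM_FEATURE_X`, avoids evaluating a macro the compiler may not have predefined. Under -Wundef (or -Werror=undef) the old form warns because an undefined macro silently evaluates to 0; the new form tests the same condition without the warning. A minimal standalone illustration with a placeholder macro name:

/* With -Wundef, the first test warns when __SOME_FEATURE is not defined;
   the second never does, while guarding the same code. */
#if __SOME_FEATURE
int feature_enabled_old = 1;
#endif

#if defined(__SOME_FEATURE) && __SOME_FEATURE
int feature_enabled_new = 1;
#endif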
+1 -1
@@ -29,7 +29,7 @@
typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)
#if defined(__aarch64__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
+17505 -17675
File diff suppressed because it is too large
+182
@@ -0,0 +1,182 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H
#include <arm_neon.h>
#include <arm_sve.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio \
static __inline__ \
__attribute__((__always_inline__, __nodebug__, __overloadable__))
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
#undef __ai
#undef __aio
#ifdef __cplusplus
} // extern "C"
#endif
#endif //__ARM_NEON_SVE_BRIDGE_H
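For illustration (not part of the commit): a sketch of the NEON/SVE round trip this new header enables. It assumes an AArch64 target compiled with SVE available (for example -march=armv8-a+sve); the function name is illustrative:

#include <arm_neon_sve_bridge.h>   /* also pulls in arm_neon.h and arm_sve.h */

/* Broadcast a 128-bit NEON vector across a scalable SVE register, scale it
   with an SVE operation, then take the low 128 bits back as a NEON vector. */
static float32x4_t scale_via_sve(float32x4_t v, float s) {
  svfloat32_t sv = svdup_neonq_f32(v);          /* replicate v across the full VL */
  sv = svmul_n_f32_x(svptrue_b32(), sv, s);     /* SVE multiply by scalar         */
  return svget_neonq_f32(sv);                   /* low 128 bits back to NEON      */
}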
+944 -1096
@@ -11,10 +11,6 @@
#ifndef __ARM_SVE_H
#define __ARM_SVE_H
#if !defined(__ARM_FEATURE_SVE)
#error "SVE support not enabled"
#else
#if !defined(__LITTLE_ENDIAN__)
#error "Big endian is currently not supported for arm_sve.h"
#endif
@@ -39,19 +35,9 @@ typedef __SVUint32_t svuint32_t;
typedef __SVUint64_t svuint64_t;
typedef __SVFloat16_t svfloat16_t;
#if defined(__ARM_FEATURE_SVE_BF16) && !defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)
#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC must be defined when __ARM_FEATURE_SVE_BF16 is defined"
#endif
#if defined(__ARM_FEATURE_SVE_BF16)
typedef __SVBFloat16_t svbfloat16_t;
#endif
#if defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)
#include <arm_bf16.h>
typedef __bf16 bfloat16_t;
#endif
typedef __SVFloat32_t svfloat32_t;
typedef __SVFloat64_t svfloat64_t;
typedef __clang_svint8x2_t svint8x2_t;
@@ -89,11 +75,9 @@ typedef __clang_svfloat32x4_t svfloat32x4_t;
typedef __clang_svfloat64x4_t svfloat64x4_t;
typedef __SVBool_t svbool_t;
#ifdef __ARM_FEATURE_SVE_BF16
typedef __clang_svbfloat16x2_t svbfloat16x2_t;
typedef __clang_svbfloat16x3_t svbfloat16x3_t;
typedef __clang_svbfloat16x4_t svbfloat16x4_t;
#endif
enum svpattern
{
SV_POW2 = 0,
@@ -145,9 +129,7 @@ enum svprfop
#define svreinterpret_s8_u32(...) __builtin_sve_reinterpret_s8_u32(__VA_ARGS__)
#define svreinterpret_s8_u64(...) __builtin_sve_reinterpret_s8_u64(__VA_ARGS__)
#define svreinterpret_s8_f16(...) __builtin_sve_reinterpret_s8_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_s8_bf16(...) __builtin_sve_reinterpret_s8_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_s8_f32(...) __builtin_sve_reinterpret_s8_f32(__VA_ARGS__)
#define svreinterpret_s8_f64(...) __builtin_sve_reinterpret_s8_f64(__VA_ARGS__)
#define svreinterpret_s16_s8(...) __builtin_sve_reinterpret_s16_s8(__VA_ARGS__)
@@ -159,9 +141,7 @@ enum svprfop
#define svreinterpret_s16_u32(...) __builtin_sve_reinterpret_s16_u32(__VA_ARGS__)
#define svreinterpret_s16_u64(...) __builtin_sve_reinterpret_s16_u64(__VA_ARGS__)
#define svreinterpret_s16_f16(...) __builtin_sve_reinterpret_s16_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_s16_bf16(...) __builtin_sve_reinterpret_s16_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_s16_f32(...) __builtin_sve_reinterpret_s16_f32(__VA_ARGS__)
#define svreinterpret_s16_f64(...) __builtin_sve_reinterpret_s16_f64(__VA_ARGS__)
#define svreinterpret_s32_s8(...) __builtin_sve_reinterpret_s32_s8(__VA_ARGS__)
@@ -173,9 +153,7 @@ enum svprfop
#define svreinterpret_s32_u32(...) __builtin_sve_reinterpret_s32_u32(__VA_ARGS__)
#define svreinterpret_s32_u64(...) __builtin_sve_reinterpret_s32_u64(__VA_ARGS__)
#define svreinterpret_s32_f16(...) __builtin_sve_reinterpret_s32_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_s32_bf16(...) __builtin_sve_reinterpret_s32_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_s32_f32(...) __builtin_sve_reinterpret_s32_f32(__VA_ARGS__)
#define svreinterpret_s32_f64(...) __builtin_sve_reinterpret_s32_f64(__VA_ARGS__)
#define svreinterpret_s64_s8(...) __builtin_sve_reinterpret_s64_s8(__VA_ARGS__)
@@ -187,9 +165,7 @@ enum svprfop
#define svreinterpret_s64_u32(...) __builtin_sve_reinterpret_s64_u32(__VA_ARGS__)
#define svreinterpret_s64_u64(...) __builtin_sve_reinterpret_s64_u64(__VA_ARGS__)
#define svreinterpret_s64_f16(...) __builtin_sve_reinterpret_s64_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_s64_bf16(...) __builtin_sve_reinterpret_s64_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_s64_f32(...) __builtin_sve_reinterpret_s64_f32(__VA_ARGS__)
#define svreinterpret_s64_f64(...) __builtin_sve_reinterpret_s64_f64(__VA_ARGS__)
#define svreinterpret_u8_s8(...) __builtin_sve_reinterpret_u8_s8(__VA_ARGS__)
@@ -201,9 +177,7 @@ enum svprfop
#define svreinterpret_u8_u32(...) __builtin_sve_reinterpret_u8_u32(__VA_ARGS__)
#define svreinterpret_u8_u64(...) __builtin_sve_reinterpret_u8_u64(__VA_ARGS__)
#define svreinterpret_u8_f16(...) __builtin_sve_reinterpret_u8_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_u8_bf16(...) __builtin_sve_reinterpret_u8_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_u8_f32(...) __builtin_sve_reinterpret_u8_f32(__VA_ARGS__)
#define svreinterpret_u8_f64(...) __builtin_sve_reinterpret_u8_f64(__VA_ARGS__)
#define svreinterpret_u16_s8(...) __builtin_sve_reinterpret_u16_s8(__VA_ARGS__)
@@ -215,9 +189,7 @@ enum svprfop
#define svreinterpret_u16_u32(...) __builtin_sve_reinterpret_u16_u32(__VA_ARGS__)
#define svreinterpret_u16_u64(...) __builtin_sve_reinterpret_u16_u64(__VA_ARGS__)
#define svreinterpret_u16_f16(...) __builtin_sve_reinterpret_u16_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_u16_bf16(...) __builtin_sve_reinterpret_u16_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_u16_f32(...) __builtin_sve_reinterpret_u16_f32(__VA_ARGS__)
#define svreinterpret_u16_f64(...) __builtin_sve_reinterpret_u16_f64(__VA_ARGS__)
#define svreinterpret_u32_s8(...) __builtin_sve_reinterpret_u32_s8(__VA_ARGS__)
@@ -229,9 +201,7 @@ enum svprfop
#define svreinterpret_u32_u32(...) __builtin_sve_reinterpret_u32_u32(__VA_ARGS__)
#define svreinterpret_u32_u64(...) __builtin_sve_reinterpret_u32_u64(__VA_ARGS__)
#define svreinterpret_u32_f16(...) __builtin_sve_reinterpret_u32_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_u32_bf16(...) __builtin_sve_reinterpret_u32_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_u32_f32(...) __builtin_sve_reinterpret_u32_f32(__VA_ARGS__)
#define svreinterpret_u32_f64(...) __builtin_sve_reinterpret_u32_f64(__VA_ARGS__)
#define svreinterpret_u64_s8(...) __builtin_sve_reinterpret_u64_s8(__VA_ARGS__)
@@ -243,9 +213,7 @@ enum svprfop
#define svreinterpret_u64_u32(...) __builtin_sve_reinterpret_u64_u32(__VA_ARGS__)
#define svreinterpret_u64_u64(...) __builtin_sve_reinterpret_u64_u64(__VA_ARGS__)
#define svreinterpret_u64_f16(...) __builtin_sve_reinterpret_u64_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_u64_bf16(...) __builtin_sve_reinterpret_u64_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_u64_f32(...) __builtin_sve_reinterpret_u64_f32(__VA_ARGS__)
#define svreinterpret_u64_f64(...) __builtin_sve_reinterpret_u64_f64(__VA_ARGS__)
#define svreinterpret_f16_s8(...) __builtin_sve_reinterpret_f16_s8(__VA_ARGS__)
@@ -257,47 +225,21 @@ enum svprfop
#define svreinterpret_f16_u32(...) __builtin_sve_reinterpret_f16_u32(__VA_ARGS__)
#define svreinterpret_f16_u64(...) __builtin_sve_reinterpret_f16_u64(__VA_ARGS__)
#define svreinterpret_f16_f16(...) __builtin_sve_reinterpret_f16_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_f16_bf16(...) __builtin_sve_reinterpret_f16_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_f16_f32(...) __builtin_sve_reinterpret_f16_f32(__VA_ARGS__)
#define svreinterpret_f16_f64(...) __builtin_sve_reinterpret_f16_f64(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_s8(...) __builtin_sve_reinterpret_bf16_s8(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_s16(...) __builtin_sve_reinterpret_bf16_s16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_s32(...) __builtin_sve_reinterpret_bf16_s32(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_s64(...) __builtin_sve_reinterpret_bf16_s64(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_u8(...) __builtin_sve_reinterpret_bf16_u8(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_u16(...) __builtin_sve_reinterpret_bf16_u16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_u32(...) __builtin_sve_reinterpret_bf16_u32(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_u64(...) __builtin_sve_reinterpret_bf16_u64(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_f16(...) __builtin_sve_reinterpret_bf16_f16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_bf16(...) __builtin_sve_reinterpret_bf16_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_f32(...) __builtin_sve_reinterpret_bf16_f32(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_bf16_f64(...) __builtin_sve_reinterpret_bf16_f64(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_f32_s8(...) __builtin_sve_reinterpret_f32_s8(__VA_ARGS__)
#define svreinterpret_f32_s16(...) __builtin_sve_reinterpret_f32_s16(__VA_ARGS__)
#define svreinterpret_f32_s32(...) __builtin_sve_reinterpret_f32_s32(__VA_ARGS__)
@@ -307,9 +249,7 @@ enum svprfop
#define svreinterpret_f32_u32(...) __builtin_sve_reinterpret_f32_u32(__VA_ARGS__)
#define svreinterpret_f32_u64(...) __builtin_sve_reinterpret_f32_u64(__VA_ARGS__)
#define svreinterpret_f32_f16(...) __builtin_sve_reinterpret_f32_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_f32_bf16(...) __builtin_sve_reinterpret_f32_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_f32_f32(...) __builtin_sve_reinterpret_f32_f32(__VA_ARGS__)
#define svreinterpret_f32_f64(...) __builtin_sve_reinterpret_f32_f64(__VA_ARGS__)
#define svreinterpret_f64_s8(...) __builtin_sve_reinterpret_f64_s8(__VA_ARGS__)
@@ -321,630 +261,582 @@ enum svprfop
#define svreinterpret_f64_u32(...) __builtin_sve_reinterpret_f64_u32(__VA_ARGS__)
#define svreinterpret_f64_u64(...) __builtin_sve_reinterpret_f64_u64(__VA_ARGS__)
#define svreinterpret_f64_f16(...) __builtin_sve_reinterpret_f64_f16(__VA_ARGS__)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svreinterpret_f64_bf16(...) __builtin_sve_reinterpret_f64_bf16(__VA_ARGS__)
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#define svreinterpret_f64_f32(...) __builtin_sve_reinterpret_f64_f32(__VA_ARGS__)
#define svreinterpret_f64_f64(...) __builtin_sve_reinterpret_f64_f64(__VA_ARGS__)
__aio svint8_t svreinterpret_s8(svint8_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svint8_t op) {
return __builtin_sve_reinterpret_s8_s8(op);
}
__aio svint8_t svreinterpret_s8(svint16_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svint16_t op) {
return __builtin_sve_reinterpret_s8_s16(op);
}
__aio svint8_t svreinterpret_s8(svint32_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svint32_t op) {
return __builtin_sve_reinterpret_s8_s32(op);
}
__aio svint8_t svreinterpret_s8(svint64_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svint64_t op) {
return __builtin_sve_reinterpret_s8_s64(op);
}
__aio svint8_t svreinterpret_s8(svuint8_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svuint8_t op) {
return __builtin_sve_reinterpret_s8_u8(op);
}
__aio svint8_t svreinterpret_s8(svuint16_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svuint16_t op) {
return __builtin_sve_reinterpret_s8_u16(op);
}
__aio svint8_t svreinterpret_s8(svuint32_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svuint32_t op) {
return __builtin_sve_reinterpret_s8_u32(op);
}
__aio svint8_t svreinterpret_s8(svuint64_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svuint64_t op) {
return __builtin_sve_reinterpret_s8_u64(op);
}
__aio svint8_t svreinterpret_s8(svfloat16_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svfloat16_t op) {
return __builtin_sve_reinterpret_s8_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svint8_t svreinterpret_s8(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svbfloat16_t op) {
return __builtin_sve_reinterpret_s8_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svint8_t svreinterpret_s8(svfloat32_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svfloat32_t op) {
return __builtin_sve_reinterpret_s8_f32(op);
}
__aio svint8_t svreinterpret_s8(svfloat64_t op) {
__aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svfloat64_t op) {
return __builtin_sve_reinterpret_s8_f64(op);
}
__aio svint16_t svreinterpret_s16(svint8_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svint8_t op) {
return __builtin_sve_reinterpret_s16_s8(op);
}
__aio svint16_t svreinterpret_s16(svint16_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svint16_t op) {
return __builtin_sve_reinterpret_s16_s16(op);
}
__aio svint16_t svreinterpret_s16(svint32_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svint32_t op) {
return __builtin_sve_reinterpret_s16_s32(op);
}
__aio svint16_t svreinterpret_s16(svint64_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svint64_t op) {
return __builtin_sve_reinterpret_s16_s64(op);
}
__aio svint16_t svreinterpret_s16(svuint8_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svuint8_t op) {
return __builtin_sve_reinterpret_s16_u8(op);
}
__aio svint16_t svreinterpret_s16(svuint16_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svuint16_t op) {
return __builtin_sve_reinterpret_s16_u16(op);
}
__aio svint16_t svreinterpret_s16(svuint32_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svuint32_t op) {
return __builtin_sve_reinterpret_s16_u32(op);
}
__aio svint16_t svreinterpret_s16(svuint64_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svuint64_t op) {
return __builtin_sve_reinterpret_s16_u64(op);
}
__aio svint16_t svreinterpret_s16(svfloat16_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svfloat16_t op) {
return __builtin_sve_reinterpret_s16_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svint16_t svreinterpret_s16(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svbfloat16_t op) {
return __builtin_sve_reinterpret_s16_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svint16_t svreinterpret_s16(svfloat32_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svfloat32_t op) {
return __builtin_sve_reinterpret_s16_f32(op);
}
__aio svint16_t svreinterpret_s16(svfloat64_t op) {
__aio __attribute__((target("sve"))) svint16_t svreinterpret_s16(svfloat64_t op) {
return __builtin_sve_reinterpret_s16_f64(op);
}
__aio svint32_t svreinterpret_s32(svint8_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svint8_t op) {
return __builtin_sve_reinterpret_s32_s8(op);
}
__aio svint32_t svreinterpret_s32(svint16_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svint16_t op) {
return __builtin_sve_reinterpret_s32_s16(op);
}
__aio svint32_t svreinterpret_s32(svint32_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svint32_t op) {
return __builtin_sve_reinterpret_s32_s32(op);
}
__aio svint32_t svreinterpret_s32(svint64_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svint64_t op) {
return __builtin_sve_reinterpret_s32_s64(op);
}
__aio svint32_t svreinterpret_s32(svuint8_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svuint8_t op) {
return __builtin_sve_reinterpret_s32_u8(op);
}
__aio svint32_t svreinterpret_s32(svuint16_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svuint16_t op) {
return __builtin_sve_reinterpret_s32_u16(op);
}
__aio svint32_t svreinterpret_s32(svuint32_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svuint32_t op) {
return __builtin_sve_reinterpret_s32_u32(op);
}
__aio svint32_t svreinterpret_s32(svuint64_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svuint64_t op) {
return __builtin_sve_reinterpret_s32_u64(op);
}
__aio svint32_t svreinterpret_s32(svfloat16_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svfloat16_t op) {
return __builtin_sve_reinterpret_s32_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svint32_t svreinterpret_s32(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svbfloat16_t op) {
return __builtin_sve_reinterpret_s32_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svint32_t svreinterpret_s32(svfloat32_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svfloat32_t op) {
return __builtin_sve_reinterpret_s32_f32(op);
}
__aio svint32_t svreinterpret_s32(svfloat64_t op) {
__aio __attribute__((target("sve"))) svint32_t svreinterpret_s32(svfloat64_t op) {
return __builtin_sve_reinterpret_s32_f64(op);
}
__aio svint64_t svreinterpret_s64(svint8_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svint8_t op) {
return __builtin_sve_reinterpret_s64_s8(op);
}
__aio svint64_t svreinterpret_s64(svint16_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svint16_t op) {
return __builtin_sve_reinterpret_s64_s16(op);
}
__aio svint64_t svreinterpret_s64(svint32_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svint32_t op) {
return __builtin_sve_reinterpret_s64_s32(op);
}
__aio svint64_t svreinterpret_s64(svint64_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svint64_t op) {
return __builtin_sve_reinterpret_s64_s64(op);
}
__aio svint64_t svreinterpret_s64(svuint8_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svuint8_t op) {
return __builtin_sve_reinterpret_s64_u8(op);
}
__aio svint64_t svreinterpret_s64(svuint16_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svuint16_t op) {
return __builtin_sve_reinterpret_s64_u16(op);
}
__aio svint64_t svreinterpret_s64(svuint32_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svuint32_t op) {
return __builtin_sve_reinterpret_s64_u32(op);
}
__aio svint64_t svreinterpret_s64(svuint64_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svuint64_t op) {
return __builtin_sve_reinterpret_s64_u64(op);
}
__aio svint64_t svreinterpret_s64(svfloat16_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svfloat16_t op) {
return __builtin_sve_reinterpret_s64_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svint64_t svreinterpret_s64(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svbfloat16_t op) {
return __builtin_sve_reinterpret_s64_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svint64_t svreinterpret_s64(svfloat32_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svfloat32_t op) {
return __builtin_sve_reinterpret_s64_f32(op);
}
__aio svint64_t svreinterpret_s64(svfloat64_t op) {
__aio __attribute__((target("sve"))) svint64_t svreinterpret_s64(svfloat64_t op) {
return __builtin_sve_reinterpret_s64_f64(op);
}
__aio svuint8_t svreinterpret_u8(svint8_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svint8_t op) {
return __builtin_sve_reinterpret_u8_s8(op);
}
__aio svuint8_t svreinterpret_u8(svint16_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svint16_t op) {
return __builtin_sve_reinterpret_u8_s16(op);
}
__aio svuint8_t svreinterpret_u8(svint32_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svint32_t op) {
return __builtin_sve_reinterpret_u8_s32(op);
}
__aio svuint8_t svreinterpret_u8(svint64_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svint64_t op) {
return __builtin_sve_reinterpret_u8_s64(op);
}
__aio svuint8_t svreinterpret_u8(svuint8_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svuint8_t op) {
return __builtin_sve_reinterpret_u8_u8(op);
}
__aio svuint8_t svreinterpret_u8(svuint16_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svuint16_t op) {
return __builtin_sve_reinterpret_u8_u16(op);
}
__aio svuint8_t svreinterpret_u8(svuint32_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svuint32_t op) {
return __builtin_sve_reinterpret_u8_u32(op);
}
__aio svuint8_t svreinterpret_u8(svuint64_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svuint64_t op) {
return __builtin_sve_reinterpret_u8_u64(op);
}
__aio svuint8_t svreinterpret_u8(svfloat16_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svfloat16_t op) {
return __builtin_sve_reinterpret_u8_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svuint8_t svreinterpret_u8(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svbfloat16_t op) {
return __builtin_sve_reinterpret_u8_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svuint8_t svreinterpret_u8(svfloat32_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svfloat32_t op) {
return __builtin_sve_reinterpret_u8_f32(op);
}
__aio svuint8_t svreinterpret_u8(svfloat64_t op) {
__aio __attribute__((target("sve"))) svuint8_t svreinterpret_u8(svfloat64_t op) {
return __builtin_sve_reinterpret_u8_f64(op);
}
__aio svuint16_t svreinterpret_u16(svint8_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svint8_t op) {
return __builtin_sve_reinterpret_u16_s8(op);
}
__aio svuint16_t svreinterpret_u16(svint16_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svint16_t op) {
return __builtin_sve_reinterpret_u16_s16(op);
}
__aio svuint16_t svreinterpret_u16(svint32_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svint32_t op) {
return __builtin_sve_reinterpret_u16_s32(op);
}
__aio svuint16_t svreinterpret_u16(svint64_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svint64_t op) {
return __builtin_sve_reinterpret_u16_s64(op);
}
__aio svuint16_t svreinterpret_u16(svuint8_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svuint8_t op) {
return __builtin_sve_reinterpret_u16_u8(op);
}
__aio svuint16_t svreinterpret_u16(svuint16_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svuint16_t op) {
return __builtin_sve_reinterpret_u16_u16(op);
}
__aio svuint16_t svreinterpret_u16(svuint32_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svuint32_t op) {
return __builtin_sve_reinterpret_u16_u32(op);
}
__aio svuint16_t svreinterpret_u16(svuint64_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svuint64_t op) {
return __builtin_sve_reinterpret_u16_u64(op);
}
__aio svuint16_t svreinterpret_u16(svfloat16_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svfloat16_t op) {
return __builtin_sve_reinterpret_u16_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svuint16_t svreinterpret_u16(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svbfloat16_t op) {
return __builtin_sve_reinterpret_u16_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svuint16_t svreinterpret_u16(svfloat32_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svfloat32_t op) {
return __builtin_sve_reinterpret_u16_f32(op);
}
__aio svuint16_t svreinterpret_u16(svfloat64_t op) {
__aio __attribute__((target("sve"))) svuint16_t svreinterpret_u16(svfloat64_t op) {
return __builtin_sve_reinterpret_u16_f64(op);
}
__aio svuint32_t svreinterpret_u32(svint8_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svint8_t op) {
return __builtin_sve_reinterpret_u32_s8(op);
}
__aio svuint32_t svreinterpret_u32(svint16_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svint16_t op) {
return __builtin_sve_reinterpret_u32_s16(op);
}
__aio svuint32_t svreinterpret_u32(svint32_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svint32_t op) {
return __builtin_sve_reinterpret_u32_s32(op);
}
__aio svuint32_t svreinterpret_u32(svint64_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svint64_t op) {
return __builtin_sve_reinterpret_u32_s64(op);
}
__aio svuint32_t svreinterpret_u32(svuint8_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svuint8_t op) {
return __builtin_sve_reinterpret_u32_u8(op);
}
__aio svuint32_t svreinterpret_u32(svuint16_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svuint16_t op) {
return __builtin_sve_reinterpret_u32_u16(op);
}
__aio svuint32_t svreinterpret_u32(svuint32_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svuint32_t op) {
return __builtin_sve_reinterpret_u32_u32(op);
}
__aio svuint32_t svreinterpret_u32(svuint64_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svuint64_t op) {
return __builtin_sve_reinterpret_u32_u64(op);
}
__aio svuint32_t svreinterpret_u32(svfloat16_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svfloat16_t op) {
return __builtin_sve_reinterpret_u32_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svuint32_t svreinterpret_u32(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svbfloat16_t op) {
return __builtin_sve_reinterpret_u32_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svuint32_t svreinterpret_u32(svfloat32_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svfloat32_t op) {
return __builtin_sve_reinterpret_u32_f32(op);
}
__aio svuint32_t svreinterpret_u32(svfloat64_t op) {
__aio __attribute__((target("sve"))) svuint32_t svreinterpret_u32(svfloat64_t op) {
return __builtin_sve_reinterpret_u32_f64(op);
}
__aio svuint64_t svreinterpret_u64(svint8_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svint8_t op) {
return __builtin_sve_reinterpret_u64_s8(op);
}
__aio svuint64_t svreinterpret_u64(svint16_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svint16_t op) {
return __builtin_sve_reinterpret_u64_s16(op);
}
__aio svuint64_t svreinterpret_u64(svint32_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svint32_t op) {
return __builtin_sve_reinterpret_u64_s32(op);
}
__aio svuint64_t svreinterpret_u64(svint64_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svint64_t op) {
return __builtin_sve_reinterpret_u64_s64(op);
}
__aio svuint64_t svreinterpret_u64(svuint8_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svuint8_t op) {
return __builtin_sve_reinterpret_u64_u8(op);
}
__aio svuint64_t svreinterpret_u64(svuint16_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svuint16_t op) {
return __builtin_sve_reinterpret_u64_u16(op);
}
__aio svuint64_t svreinterpret_u64(svuint32_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svuint32_t op) {
return __builtin_sve_reinterpret_u64_u32(op);
}
__aio svuint64_t svreinterpret_u64(svuint64_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svuint64_t op) {
return __builtin_sve_reinterpret_u64_u64(op);
}
__aio svuint64_t svreinterpret_u64(svfloat16_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svfloat16_t op) {
return __builtin_sve_reinterpret_u64_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svuint64_t svreinterpret_u64(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svbfloat16_t op) {
return __builtin_sve_reinterpret_u64_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svuint64_t svreinterpret_u64(svfloat32_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svfloat32_t op) {
return __builtin_sve_reinterpret_u64_f32(op);
}
__aio svuint64_t svreinterpret_u64(svfloat64_t op) {
__aio __attribute__((target("sve"))) svuint64_t svreinterpret_u64(svfloat64_t op) {
return __builtin_sve_reinterpret_u64_f64(op);
}
__aio svfloat16_t svreinterpret_f16(svint8_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svint8_t op) {
return __builtin_sve_reinterpret_f16_s8(op);
}
__aio svfloat16_t svreinterpret_f16(svint16_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svint16_t op) {
return __builtin_sve_reinterpret_f16_s16(op);
}
__aio svfloat16_t svreinterpret_f16(svint32_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svint32_t op) {
return __builtin_sve_reinterpret_f16_s32(op);
}
__aio svfloat16_t svreinterpret_f16(svint64_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svint64_t op) {
return __builtin_sve_reinterpret_f16_s64(op);
}
__aio svfloat16_t svreinterpret_f16(svuint8_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svuint8_t op) {
return __builtin_sve_reinterpret_f16_u8(op);
}
__aio svfloat16_t svreinterpret_f16(svuint16_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svuint16_t op) {
return __builtin_sve_reinterpret_f16_u16(op);
}
__aio svfloat16_t svreinterpret_f16(svuint32_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svuint32_t op) {
return __builtin_sve_reinterpret_f16_u32(op);
}
__aio svfloat16_t svreinterpret_f16(svuint64_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svuint64_t op) {
return __builtin_sve_reinterpret_f16_u64(op);
}
__aio svfloat16_t svreinterpret_f16(svfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svfloat16_t op) {
return __builtin_sve_reinterpret_f16_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svfloat16_t svreinterpret_f16(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svbfloat16_t op) {
return __builtin_sve_reinterpret_f16_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svfloat16_t svreinterpret_f16(svfloat32_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svfloat32_t op) {
return __builtin_sve_reinterpret_f16_f32(op);
}
__aio svfloat16_t svreinterpret_f16(svfloat64_t op) {
__aio __attribute__((target("sve"))) svfloat16_t svreinterpret_f16(svfloat64_t op) {
return __builtin_sve_reinterpret_f16_f64(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svint8_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svint8_t op) {
return __builtin_sve_reinterpret_bf16_s8(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svint16_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svint16_t op) {
return __builtin_sve_reinterpret_bf16_s16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svint32_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svint32_t op) {
return __builtin_sve_reinterpret_bf16_s32(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svint64_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svint64_t op) {
return __builtin_sve_reinterpret_bf16_s64(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svuint8_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svuint8_t op) {
return __builtin_sve_reinterpret_bf16_u8(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svuint16_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svuint16_t op) {
return __builtin_sve_reinterpret_bf16_u16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svuint32_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svuint32_t op) {
return __builtin_sve_reinterpret_bf16_u32(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svuint64_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svuint64_t op) {
return __builtin_sve_reinterpret_bf16_u64(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svfloat16_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svfloat16_t op) {
return __builtin_sve_reinterpret_bf16_f16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svbfloat16_t op) {
return __builtin_sve_reinterpret_bf16_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svfloat32_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svfloat32_t op) {
return __builtin_sve_reinterpret_bf16_f32(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svbfloat16_t svreinterpret_bf16(svfloat64_t op) {
__aio __attribute__((target("sve"))) svbfloat16_t svreinterpret_bf16(svfloat64_t op) {
return __builtin_sve_reinterpret_bf16_f64(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svfloat32_t svreinterpret_f32(svint8_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svint8_t op) {
return __builtin_sve_reinterpret_f32_s8(op);
}
__aio svfloat32_t svreinterpret_f32(svint16_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svint16_t op) {
return __builtin_sve_reinterpret_f32_s16(op);
}
__aio svfloat32_t svreinterpret_f32(svint32_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svint32_t op) {
return __builtin_sve_reinterpret_f32_s32(op);
}
__aio svfloat32_t svreinterpret_f32(svint64_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svint64_t op) {
return __builtin_sve_reinterpret_f32_s64(op);
}
__aio svfloat32_t svreinterpret_f32(svuint8_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svuint8_t op) {
return __builtin_sve_reinterpret_f32_u8(op);
}
__aio svfloat32_t svreinterpret_f32(svuint16_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svuint16_t op) {
return __builtin_sve_reinterpret_f32_u16(op);
}
__aio svfloat32_t svreinterpret_f32(svuint32_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svuint32_t op) {
return __builtin_sve_reinterpret_f32_u32(op);
}
__aio svfloat32_t svreinterpret_f32(svuint64_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svuint64_t op) {
return __builtin_sve_reinterpret_f32_u64(op);
}
__aio svfloat32_t svreinterpret_f32(svfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svfloat16_t op) {
return __builtin_sve_reinterpret_f32_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svfloat32_t svreinterpret_f32(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svbfloat16_t op) {
return __builtin_sve_reinterpret_f32_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svfloat32_t svreinterpret_f32(svfloat32_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svfloat32_t op) {
return __builtin_sve_reinterpret_f32_f32(op);
}
__aio svfloat32_t svreinterpret_f32(svfloat64_t op) {
__aio __attribute__((target("sve"))) svfloat32_t svreinterpret_f32(svfloat64_t op) {
return __builtin_sve_reinterpret_f32_f64(op);
}
__aio svfloat64_t svreinterpret_f64(svint8_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svint8_t op) {
return __builtin_sve_reinterpret_f64_s8(op);
}
__aio svfloat64_t svreinterpret_f64(svint16_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svint16_t op) {
return __builtin_sve_reinterpret_f64_s16(op);
}
__aio svfloat64_t svreinterpret_f64(svint32_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svint32_t op) {
return __builtin_sve_reinterpret_f64_s32(op);
}
__aio svfloat64_t svreinterpret_f64(svint64_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svint64_t op) {
return __builtin_sve_reinterpret_f64_s64(op);
}
__aio svfloat64_t svreinterpret_f64(svuint8_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svuint8_t op) {
return __builtin_sve_reinterpret_f64_u8(op);
}
__aio svfloat64_t svreinterpret_f64(svuint16_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svuint16_t op) {
return __builtin_sve_reinterpret_f64_u16(op);
}
__aio svfloat64_t svreinterpret_f64(svuint32_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svuint32_t op) {
return __builtin_sve_reinterpret_f64_u32(op);
}
__aio svfloat64_t svreinterpret_f64(svuint64_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svuint64_t op) {
return __builtin_sve_reinterpret_f64_u64(op);
}
__aio svfloat64_t svreinterpret_f64(svfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svfloat16_t op) {
return __builtin_sve_reinterpret_f64_f16(op);
}
#if defined(__ARM_FEATURE_SVE_BF16)
__aio svfloat64_t svreinterpret_f64(svbfloat16_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svbfloat16_t op) {
return __builtin_sve_reinterpret_f64_bf16(op);
}
#endif /* #if defined(__ARM_FEATURE_SVE_BF16) */
__aio svfloat64_t svreinterpret_f64(svfloat32_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svfloat32_t op) {
return __builtin_sve_reinterpret_f64_f32(op);
}
__aio svfloat64_t svreinterpret_f64(svfloat64_t op) {
__aio __attribute__((target("sve"))) svfloat64_t svreinterpret_f64(svfloat64_t op) {
return __builtin_sve_reinterpret_f64_f64(op);
}
@@ -15698,107 +15590,702 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s64)))
svint64_t svzip2(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s16)))
svint16_t svzip2(svint16_t, svint16_t);
#if defined (__ARM_FEATURE_SVE2_BITPERM)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
svuint8_t svbdep_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
svuint32_t svbdep_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
svuint64_t svbdep_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
svuint16_t svbdep_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
svuint8_t svbdep_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
svuint32_t svbdep_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
svuint64_t svbdep_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
svuint16_t svbdep_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
svuint8_t svbext_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
svuint32_t svbext_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
svuint64_t svbext_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
svuint16_t svbext_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
svuint8_t svbext_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
svuint32_t svbext_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
svuint64_t svbext_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
svuint16_t svbext_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
svuint8_t svbgrp_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
svuint32_t svbgrp_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
svuint64_t svbgrp_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
svuint16_t svbgrp_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
svuint8_t svbgrp_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
svuint32_t svbgrp_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
svuint64_t svbgrp_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
svuint16_t svbgrp_u16(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
svuint8_t svbdep(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
svuint32_t svbdep(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
svuint64_t svbdep(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
svuint16_t svbdep(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
svuint8_t svbdep(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
svuint32_t svbdep(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
svuint64_t svbdep(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
svuint16_t svbdep(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
svuint8_t svbext(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
svuint32_t svbext(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
svuint64_t svbext(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
svuint16_t svbext(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
svuint8_t svbext(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
svuint32_t svbext(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
svuint64_t svbext(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
svuint16_t svbext(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
svuint8_t svbgrp(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
svuint32_t svbgrp(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
svuint64_t svbgrp(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
svuint16_t svbgrp(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
svuint8_t svbgrp(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
svuint32_t svbgrp(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
svuint64_t svbgrp(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
svuint16_t svbgrp(svuint16_t, svuint16_t);
#endif //defined (__ARM_FEATURE_SVE2_BITPERM)
#if defined(__ARM_FEATURE_SVE2)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
svfloat32_t svbfdot_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
svfloat32_t svbfdot_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
svfloat32_t svbfdot_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
svfloat32_t svbfmlalb_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
svfloat32_t svbfmlalb_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
svfloat32_t svbfmlalb_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
svfloat32_t svbfmlalt_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
svfloat32_t svbfmlalt_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
svfloat32_t svbfmlalt_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
svfloat32_t svbfmmla_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
bfloat16_t svclasta_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
svbfloat16_t svclasta_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
bfloat16_t svclastb_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
svbfloat16_t svclastb_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
svuint16_t svcnt_bf16_m(svuint16_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
svuint16_t svcnt_bf16_x(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
svuint16_t svcnt_bf16_z(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
svbfloat16x2_t svcreate2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
svbfloat16x3_t svcreate3_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
svbfloat16x4_t svcreate4_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
svbfloat16_t svcvt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
svbfloat16_t svcvt_bf16_f32_x(svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
svbfloat16_t svcvt_bf16_f32_z(svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
svbfloat16_t svcvtnt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
svbfloat16_t svdup_n_bf16(bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
svbfloat16_t svdup_n_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
svbfloat16_t svdup_n_bf16_x(svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
svbfloat16_t svdup_n_bf16_z(svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
svbfloat16_t svdup_lane_bf16(svbfloat16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
svbfloat16_t svdupq_n_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
svbfloat16_t svdupq_lane_bf16(svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
svbfloat16_t svext_bf16(svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
svbfloat16_t svget2_bf16(svbfloat16x2_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
svbfloat16_t svget3_bf16(svbfloat16x3_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
svbfloat16_t svget4_bf16(svbfloat16x4_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
svbfloat16_t svinsr_n_bf16(svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
bfloat16_t svlasta_bf16(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
bfloat16_t svlastb_bf16(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
svbfloat16_t svld1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
svbfloat16_t svld1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
svbfloat16_t svld1rq_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
svbfloat16x2_t svld2_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
svbfloat16x2_t svld2_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
svbfloat16x3_t svld3_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
svbfloat16x3_t svld3_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
svbfloat16x4_t svld4_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
svbfloat16x4_t svld4_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
svbfloat16_t svldff1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
svbfloat16_t svldff1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
svbfloat16_t svldnf1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
svbfloat16_t svldnf1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
svbfloat16_t svldnt1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
svbfloat16_t svldnt1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
uint64_t svlen_bf16(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
svbfloat16_t svrev_bf16(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
svbfloat16_t svsel_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
svbfloat16x2_t svset2_bf16(svbfloat16x2_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
svbfloat16x3_t svset3_bf16(svbfloat16x3_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
svbfloat16x4_t svset4_bf16(svbfloat16x4_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
svbfloat16_t svsplice_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
void svst1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
void svst1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
void svst2_bf16(svbool_t, bfloat16_t *, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
void svst2_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
void svst3_bf16(svbool_t, bfloat16_t *, svbfloat16x3_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
void svst3_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
void svst4_bf16(svbool_t, bfloat16_t *, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
void svst4_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
void svstnt1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
void svstnt1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
svbfloat16_t svtbl_bf16(svbfloat16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
svbfloat16x2_t svundef2_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
svbfloat16x3_t svundef3_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
svbfloat16x4_t svundef4_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
svbfloat16_t svundef_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
svbfloat16_t svuzp2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
svbfloat16_t svzip1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
svbfloat16_t svzip2_bf16(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
svfloat32_t svbfdot_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
svfloat32_t svbfmlalb_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
svfloat32_t svbfmlalt_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
svfloat32_t svbfmmla(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
bfloat16_t svclasta(svbool_t, bfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
svbfloat16_t svclasta(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
bfloat16_t svclastb(svbool_t, bfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
svbfloat16_t svclastb(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
svuint16_t svcnt_m(svuint16_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
svuint16_t svcnt_x(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
svuint16_t svcnt_z(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
svbfloat16x2_t svcreate2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
svbfloat16x3_t svcreate3(svbfloat16_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
svbfloat16x4_t svcreate4(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
svbfloat16_t svcvt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
svbfloat16_t svcvt_bf16_x(svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
svbfloat16_t svcvt_bf16_z(svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
svbfloat16_t svcvtnt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
svbfloat16_t svdup_bf16(bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
svbfloat16_t svdup_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
svbfloat16_t svdup_bf16_x(svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
svbfloat16_t svdup_bf16_z(svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
svbfloat16_t svdup_lane(svbfloat16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
svbfloat16_t svdupq_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
svbfloat16_t svdupq_lane(svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
svbfloat16_t svext(svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
svbfloat16_t svget2(svbfloat16x2_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
svbfloat16_t svget3(svbfloat16x3_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
svbfloat16_t svget4(svbfloat16x4_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
svbfloat16_t svinsr(svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
bfloat16_t svlasta(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
bfloat16_t svlastb(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
svbfloat16_t svld1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
svbfloat16_t svld1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
svbfloat16_t svld1rq(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
svbfloat16x2_t svld2(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
svbfloat16x2_t svld2_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
svbfloat16x3_t svld3(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
svbfloat16x3_t svld3_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
svbfloat16x4_t svld4(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
svbfloat16x4_t svld4_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
svbfloat16_t svldff1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
svbfloat16_t svldff1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
svbfloat16_t svldnf1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
svbfloat16_t svldnf1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
svbfloat16_t svldnt1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
svbfloat16_t svldnt1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
uint64_t svlen(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
svbfloat16_t svrev(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
svbfloat16_t svsel(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
svbfloat16x2_t svset2(svbfloat16x2_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
svbfloat16x3_t svset3(svbfloat16x3_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
svbfloat16x4_t svset4(svbfloat16x4_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
svbfloat16_t svsplice(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
void svst1(svbool_t, bfloat16_t *, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
void svst1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
void svst2(svbool_t, bfloat16_t *, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
void svst2_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
void svst3(svbool_t, bfloat16_t *, svbfloat16x3_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
void svst3_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
void svst4(svbool_t, bfloat16_t *, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
void svst4_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
void svstnt1(svbool_t, bfloat16_t *, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
void svstnt1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
svbfloat16_t svtbl(svbfloat16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
svbfloat16_t svtrn1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
svbfloat16_t svtrn2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
svbfloat16_t svuzp1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
svbfloat16_t svuzp2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
svbfloat16_t svzip1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
svbfloat16_t svzip2(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
svbfloat16_t svtrn1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
svbfloat16_t svtrn2q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
svbfloat16_t svuzp1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
svbfloat16_t svuzp2q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
svbfloat16_t svzip1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
svbfloat16_t svzip2q_bf16(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
svbfloat16_t svtrn1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
svbfloat16_t svtrn2q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
svbfloat16_t svuzp1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
svbfloat16_t svuzp2q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
svbfloat16_t svzip1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
svbfloat16_t svzip2q(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
svfloat32_t svmmla_f32(svfloat32_t, svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
svfloat32_t svmmla(svfloat32_t, svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
svuint8_t svld1ro_u8(svbool_t, uint8_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
svuint32_t svld1ro_u32(svbool_t, uint32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
svuint64_t svld1ro_u64(svbool_t, uint64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
svuint16_t svld1ro_u16(svbool_t, uint16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
svint8_t svld1ro_s8(svbool_t, int8_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
svfloat64_t svld1ro_f64(svbool_t, float64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
svfloat32_t svld1ro_f32(svbool_t, float32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
svfloat16_t svld1ro_f16(svbool_t, float16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
svint32_t svld1ro_s32(svbool_t, int32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
svint64_t svld1ro_s64(svbool_t, int64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
svint16_t svld1ro_s16(svbool_t, int16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
svfloat64_t svmmla_f64(svfloat64_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
svuint8_t svtrn1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
svuint32_t svtrn1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
svuint64_t svtrn1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
svuint16_t svtrn1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
svint8_t svtrn1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
svfloat64_t svtrn1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
svfloat32_t svtrn1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
svfloat16_t svtrn1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
svint32_t svtrn1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
svint64_t svtrn1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
svint16_t svtrn1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
svuint8_t svtrn2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
svuint32_t svtrn2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
svuint64_t svtrn2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
svuint16_t svtrn2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
svint8_t svtrn2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
svfloat64_t svtrn2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
svfloat32_t svtrn2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
svfloat16_t svtrn2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
svint32_t svtrn2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
svint64_t svtrn2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
svint16_t svtrn2q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
svuint8_t svuzp1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
svuint32_t svuzp1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
svuint64_t svuzp1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
svuint16_t svuzp1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
svint8_t svuzp1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
svfloat64_t svuzp1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
svfloat32_t svuzp1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
svfloat16_t svuzp1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
svint32_t svuzp1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
svint64_t svuzp1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
svint16_t svuzp1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
svuint8_t svuzp2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
svuint32_t svuzp2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
svuint64_t svuzp2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
svuint16_t svuzp2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
svint8_t svuzp2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
svfloat64_t svuzp2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
svfloat32_t svuzp2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
svfloat16_t svuzp2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
svint32_t svuzp2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
svint64_t svuzp2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
svint16_t svuzp2q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
svuint8_t svzip1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
svuint32_t svzip1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
svuint64_t svzip1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
svuint16_t svzip1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
svint8_t svzip1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
svfloat64_t svzip1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
svfloat32_t svzip1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
svfloat16_t svzip1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
svint32_t svzip1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
svint64_t svzip1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
svint16_t svzip1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
svuint8_t svzip2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
svuint32_t svzip2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
svuint64_t svzip2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
svuint16_t svzip2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
svint8_t svzip2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
svfloat64_t svzip2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
svfloat32_t svzip2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
svfloat16_t svzip2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
svint32_t svzip2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
svint64_t svzip2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
svint16_t svzip2q_s16(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
svuint8_t svld1ro(svbool_t, uint8_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
svuint32_t svld1ro(svbool_t, uint32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
svuint64_t svld1ro(svbool_t, uint64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
svuint16_t svld1ro(svbool_t, uint16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
svint8_t svld1ro(svbool_t, int8_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
svfloat64_t svld1ro(svbool_t, float64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
svfloat32_t svld1ro(svbool_t, float32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
svfloat16_t svld1ro(svbool_t, float16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
svint32_t svld1ro(svbool_t, int32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
svint64_t svld1ro(svbool_t, int64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
svint16_t svld1ro(svbool_t, int16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
svfloat64_t svmmla(svfloat64_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
svuint8_t svtrn1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
svuint32_t svtrn1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
svuint64_t svtrn1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
svuint16_t svtrn1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
svint8_t svtrn1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
svfloat64_t svtrn1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
svfloat32_t svtrn1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
svfloat16_t svtrn1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
svint32_t svtrn1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
svint64_t svtrn1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
svint16_t svtrn1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
svuint8_t svtrn2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
svuint32_t svtrn2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
svuint64_t svtrn2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
svuint16_t svtrn2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
svint8_t svtrn2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
svfloat64_t svtrn2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
svfloat32_t svtrn2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
svfloat16_t svtrn2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
svint32_t svtrn2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
svint64_t svtrn2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
svint16_t svtrn2q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
svuint8_t svuzp1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
svuint32_t svuzp1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
svuint64_t svuzp1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
svuint16_t svuzp1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
svint8_t svuzp1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
svfloat64_t svuzp1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
svfloat32_t svuzp1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
svfloat16_t svuzp1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
svint32_t svuzp1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
svint64_t svuzp1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
svint16_t svuzp1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
svuint8_t svuzp2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
svuint32_t svuzp2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
svuint64_t svuzp2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
svuint16_t svuzp2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
svint8_t svuzp2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
svfloat64_t svuzp2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
svfloat32_t svuzp2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
svfloat16_t svuzp2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
svint32_t svuzp2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
svint64_t svuzp2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
svint16_t svuzp2q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
svuint8_t svzip1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
svuint32_t svzip1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
svuint64_t svzip1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
svuint16_t svzip1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
svint8_t svzip1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
svfloat64_t svzip1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
svfloat32_t svzip1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
svfloat16_t svzip1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
svint32_t svzip1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
svint64_t svzip1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
svint16_t svzip1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
svuint8_t svzip2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
svuint32_t svzip2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
svuint64_t svzip2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
svuint16_t svzip2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
svint8_t svzip2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
svfloat64_t svzip2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
svfloat32_t svzip2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
svfloat16_t svzip2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
svint32_t svzip2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
svint64_t svzip2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
svint16_t svzip2q(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
svbfloat16_t svld1ro_bf16(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
svbfloat16_t svld1ro(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
svint32_t svmmla_s32(svint32_t, svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
svuint32_t svmmla_u32(svuint32_t, svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
svint32_t svsudot_n_s32(svint32_t, svint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
svint32_t svsudot_s32(svint32_t, svint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
svint32_t svsudot_lane_s32(svint32_t, svint8_t, svuint8_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
svint32_t svusdot_n_s32(svint32_t, svuint8_t, int8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
svint32_t svusdot_s32(svint32_t, svuint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
svint32_t svusdot_lane_s32(svint32_t, svuint8_t, svint8_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
svint32_t svusmmla_s32(svint32_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
svint32_t svmmla(svint32_t, svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
svuint32_t svmmla(svuint32_t, svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
svint32_t svsudot(svint32_t, svint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
svint32_t svsudot(svint32_t, svint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
svint32_t svsudot_lane(svint32_t, svint8_t, svuint8_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
svint32_t svusdot(svint32_t, svuint8_t, int8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
svint32_t svusdot(svint32_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
svint32_t svusdot_lane(svint32_t, svuint8_t, svint8_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
svint32_t svusmmla(svint32_t, svuint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s8)))
svint8_t svaba_n_s8(svint8_t, svint8_t, int8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s32)))
@@ -23227,31 +23714,22 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s64)))
svint64_t svxar(svint64_t, svint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s16)))
svint16_t svxar(svint16_t, svint16_t, uint64_t);
#endif //defined(__ARM_FEATURE_SVE2)
#if defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
svbool_t svwhilerw_bf16(bfloat16_t const *, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
svbool_t svwhilewr_bf16(bfloat16_t const *, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
svbool_t svwhilerw(bfloat16_t const *, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
svbool_t svwhilewr(bfloat16_t const *, bfloat16_t const *);
#endif //defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)
#if defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_SVE_BF16)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16)))
svbfloat16_t svtbl2_bf16(svbfloat16x2_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16)))
svbfloat16_t svtbx_bf16(svbfloat16_t, svbfloat16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
svbool_t svwhilerw_bf16(bfloat16_t const *, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
svbool_t svwhilewr_bf16(bfloat16_t const *, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16)))
svbfloat16_t svtbl2(svbfloat16x2_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16)))
svbfloat16_t svtbx(svbfloat16_t, svbfloat16_t, svuint16_t);
#endif //defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_SVE_BF16)
#if defined(__ARM_FEATURE_SVE2_AES)
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
svbool_t svwhilerw(bfloat16_t const *, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
svbool_t svwhilewr(bfloat16_t const *, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8)))
svuint8_t svaesd_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8)))
@@ -23284,9 +23762,102 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64
svuint64_t svpmullt_pair(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64)))
svuint64_t svpmullt_pair(svuint64_t, svuint64_t);
#endif //defined(__ARM_FEATURE_SVE2_AES)
#if defined(__ARM_FEATURE_SVE2_SHA3)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
svuint8_t svbdep_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
svuint32_t svbdep_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
svuint64_t svbdep_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
svuint16_t svbdep_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
svuint8_t svbdep_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
svuint32_t svbdep_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
svuint64_t svbdep_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
svuint16_t svbdep_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
svuint8_t svbext_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
svuint32_t svbext_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
svuint64_t svbext_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
svuint16_t svbext_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
svuint8_t svbext_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
svuint32_t svbext_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
svuint64_t svbext_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
svuint16_t svbext_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
svuint8_t svbgrp_n_u8(svuint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
svuint32_t svbgrp_n_u32(svuint32_t, uint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
svuint64_t svbgrp_n_u64(svuint64_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
svuint16_t svbgrp_n_u16(svuint16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
svuint8_t svbgrp_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
svuint32_t svbgrp_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
svuint64_t svbgrp_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
svuint16_t svbgrp_u16(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
svuint8_t svbdep(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
svuint32_t svbdep(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
svuint64_t svbdep(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
svuint16_t svbdep(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
svuint8_t svbdep(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
svuint32_t svbdep(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
svuint64_t svbdep(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
svuint16_t svbdep(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
svuint8_t svbext(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
svuint32_t svbext(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
svuint64_t svbext(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
svuint16_t svbext(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
svuint8_t svbext(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
svuint32_t svbext(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
svuint64_t svbext(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
svuint16_t svbext(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
svuint8_t svbgrp(svuint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
svuint32_t svbgrp(svuint32_t, uint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
svuint64_t svbgrp(svuint64_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
svuint16_t svbgrp(svuint16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
svuint8_t svbgrp(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
svuint32_t svbgrp(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
svuint64_t svbgrp(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
svuint16_t svbgrp(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_u64)))
svuint64_t svrax1_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_s64)))
@@ -23295,9 +23866,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_u64)))
svuint64_t svrax1(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_s64)))
svint64_t svrax1(svint64_t, svint64_t);
#endif //defined(__ARM_FEATURE_SVE2_SHA3)
#if defined(__ARM_FEATURE_SVE2_SM4)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4e_u32)))
svuint32_t svsm4e_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4ekey_u32)))
@@ -23306,724 +23874,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4e_u32)))
svuint32_t svsm4e(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4ekey_u32)))
svuint32_t svsm4ekey(svuint32_t, svuint32_t);
#endif //defined(__ARM_FEATURE_SVE2_SM4)
#if defined(__ARM_FEATURE_SVE_BF16)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
svfloat32_t svbfdot_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
svfloat32_t svbfdot_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
svfloat32_t svbfdot_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
svfloat32_t svbfmlalb_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
svfloat32_t svbfmlalb_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
svfloat32_t svbfmlalb_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
svfloat32_t svbfmlalt_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
svfloat32_t svbfmlalt_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
svfloat32_t svbfmlalt_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
svfloat32_t svbfmmla_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
bfloat16_t svclasta_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
svbfloat16_t svclasta_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
bfloat16_t svclastb_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
svbfloat16_t svclastb_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
svuint16_t svcnt_bf16_m(svuint16_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
svuint16_t svcnt_bf16_x(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
svuint16_t svcnt_bf16_z(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
svbfloat16x2_t svcreate2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
svbfloat16x3_t svcreate3_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
svbfloat16x4_t svcreate4_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
svbfloat16_t svcvt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
svbfloat16_t svcvt_bf16_f32_x(svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
svbfloat16_t svcvt_bf16_f32_z(svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
svbfloat16_t svcvtnt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
svbfloat16_t svdup_n_bf16(bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
svbfloat16_t svdup_n_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
svbfloat16_t svdup_n_bf16_x(svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
svbfloat16_t svdup_n_bf16_z(svbool_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
svbfloat16_t svdup_lane_bf16(svbfloat16_t, uint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
svbfloat16_t svdupq_n_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
svbfloat16_t svdupq_lane_bf16(svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
svbfloat16_t svext_bf16(svbfloat16_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
svbfloat16_t svget2_bf16(svbfloat16x2_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
svbfloat16_t svget3_bf16(svbfloat16x3_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
svbfloat16_t svget4_bf16(svbfloat16x4_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
svbfloat16_t svinsr_n_bf16(svbfloat16_t, bfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
bfloat16_t svlasta_bf16(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
bfloat16_t svlastb_bf16(svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
svbfloat16_t svld1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
svbfloat16_t svld1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
svbfloat16_t svld1rq_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
svbfloat16x2_t svld2_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
svbfloat16x2_t svld2_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
svbfloat16x3_t svld3_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
svbfloat16x3_t svld3_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
svbfloat16x4_t svld4_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
svbfloat16x4_t svld4_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
svbfloat16_t svldff1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
svbfloat16_t svldff1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
svbfloat16_t svldnf1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
svbfloat16_t svldnf1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
svbfloat16_t svldnt1_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
svbfloat16_t svldnt1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
uint64_t svlen_bf16(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
svbfloat16_t svrev_bf16(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
svbfloat16_t svsel_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
svbfloat16x2_t svset2_bf16(svbfloat16x2_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
svbfloat16x3_t svset3_bf16(svbfloat16x3_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
svbfloat16x4_t svset4_bf16(svbfloat16x4_t, uint64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
svbfloat16_t svsplice_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
void svst1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
void svst1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
void svst2_bf16(svbool_t, bfloat16_t *, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
void svst2_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
void svst3_bf16(svbool_t, bfloat16_t *, svbfloat16x3_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
void svst3_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
void svst4_bf16(svbool_t, bfloat16_t *, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
void svst4_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
void svstnt1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
void svstnt1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
svbfloat16_t svtbl_bf16(svbfloat16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
svbfloat16x2_t svundef2_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
svbfloat16x3_t svundef3_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
svbfloat16x4_t svundef4_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
svbfloat16_t svundef_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
svbfloat16_t svuzp2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
svbfloat16_t svzip1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
svbfloat16_t svzip2_bf16(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
svfloat32_t svbfdot_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
svfloat32_t svbfmlalb_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
svfloat32_t svbfmlalt_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
svfloat32_t svbfmmla(svfloat32_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
bfloat16_t svclasta(svbool_t, bfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
svbfloat16_t svclasta(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
bfloat16_t svclastb(svbool_t, bfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
svbfloat16_t svclastb(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
svuint16_t svcnt_m(svuint16_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
svuint16_t svcnt_x(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
svuint16_t svcnt_z(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
svbfloat16x2_t svcreate2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
svbfloat16x3_t svcreate3(svbfloat16_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
svbfloat16x4_t svcreate4(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
svbfloat16_t svcvt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
svbfloat16_t svcvt_bf16_x(svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
svbfloat16_t svcvt_bf16_z(svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
svbfloat16_t svcvtnt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
svbfloat16_t svdup_bf16(bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
svbfloat16_t svdup_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
svbfloat16_t svdup_bf16_x(svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
svbfloat16_t svdup_bf16_z(svbool_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
svbfloat16_t svdup_lane(svbfloat16_t, uint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
svbfloat16_t svdupq_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
svbfloat16_t svdupq_lane(svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
svbfloat16_t svext(svbfloat16_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
svbfloat16_t svget2(svbfloat16x2_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
svbfloat16_t svget3(svbfloat16x3_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
svbfloat16_t svget4(svbfloat16x4_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
svbfloat16_t svinsr(svbfloat16_t, bfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
bfloat16_t svlasta(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
bfloat16_t svlastb(svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
svbfloat16_t svld1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
svbfloat16_t svld1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
svbfloat16_t svld1rq(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
svbfloat16x2_t svld2(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
svbfloat16x2_t svld2_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
svbfloat16x3_t svld3(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
svbfloat16x3_t svld3_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
svbfloat16x4_t svld4(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
svbfloat16x4_t svld4_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
svbfloat16_t svldff1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
svbfloat16_t svldff1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
svbfloat16_t svldnf1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
svbfloat16_t svldnf1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
svbfloat16_t svldnt1(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
svbfloat16_t svldnt1_vnum(svbool_t, bfloat16_t const *, int64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
uint64_t svlen(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
svbfloat16_t svrev(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
svbfloat16_t svsel(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
svbfloat16x2_t svset2(svbfloat16x2_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
svbfloat16x3_t svset3(svbfloat16x3_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
svbfloat16x4_t svset4(svbfloat16x4_t, uint64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
svbfloat16_t svsplice(svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
void svst1(svbool_t, bfloat16_t *, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
void svst1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
void svst2(svbool_t, bfloat16_t *, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
void svst2_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
void svst3(svbool_t, bfloat16_t *, svbfloat16x3_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
void svst3_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
void svst4(svbool_t, bfloat16_t *, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
void svst4_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
void svstnt1(svbool_t, bfloat16_t *, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
void svstnt1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
svbfloat16_t svtbl(svbfloat16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
svbfloat16_t svtrn1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
svbfloat16_t svtrn2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
svbfloat16_t svuzp1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
svbfloat16_t svuzp2(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
svbfloat16_t svzip1(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
svbfloat16_t svzip2(svbfloat16_t, svbfloat16_t);
#endif //defined(__ARM_FEATURE_SVE_BF16)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP32)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
svfloat32_t svmmla_f32(svfloat32_t, svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
svfloat32_t svmmla(svfloat32_t, svfloat32_t, svfloat32_t);
#endif //defined(__ARM_FEATURE_SVE_MATMUL_FP32)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
svuint8_t svld1ro_u8(svbool_t, uint8_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
svuint32_t svld1ro_u32(svbool_t, uint32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
svuint64_t svld1ro_u64(svbool_t, uint64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
svuint16_t svld1ro_u16(svbool_t, uint16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
svint8_t svld1ro_s8(svbool_t, int8_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
svfloat64_t svld1ro_f64(svbool_t, float64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
svfloat32_t svld1ro_f32(svbool_t, float32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
svfloat16_t svld1ro_f16(svbool_t, float16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
svint32_t svld1ro_s32(svbool_t, int32_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
svint64_t svld1ro_s64(svbool_t, int64_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
svint16_t svld1ro_s16(svbool_t, int16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
svfloat64_t svmmla_f64(svfloat64_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
svuint8_t svtrn1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
svuint32_t svtrn1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
svuint64_t svtrn1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
svuint16_t svtrn1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
svint8_t svtrn1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
svfloat64_t svtrn1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
svfloat32_t svtrn1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
svfloat16_t svtrn1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
svint32_t svtrn1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
svint64_t svtrn1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
svint16_t svtrn1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
svuint8_t svtrn2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
svuint32_t svtrn2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
svuint64_t svtrn2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
svuint16_t svtrn2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
svint8_t svtrn2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
svfloat64_t svtrn2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
svfloat32_t svtrn2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
svfloat16_t svtrn2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
svint32_t svtrn2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
svint64_t svtrn2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
svint16_t svtrn2q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
svuint8_t svuzp1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
svuint32_t svuzp1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
svuint64_t svuzp1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
svuint16_t svuzp1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
svint8_t svuzp1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
svfloat64_t svuzp1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
svfloat32_t svuzp1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
svfloat16_t svuzp1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
svint32_t svuzp1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
svint64_t svuzp1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
svint16_t svuzp1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
svuint8_t svuzp2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
svuint32_t svuzp2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
svuint64_t svuzp2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
svuint16_t svuzp2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
svint8_t svuzp2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
svfloat64_t svuzp2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
svfloat32_t svuzp2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
svfloat16_t svuzp2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
svint32_t svuzp2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
svint64_t svuzp2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
svint16_t svuzp2q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
svuint8_t svzip1q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
svuint32_t svzip1q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
svuint64_t svzip1q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
svuint16_t svzip1q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
svint8_t svzip1q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
svfloat64_t svzip1q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
svfloat32_t svzip1q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
svfloat16_t svzip1q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
svint32_t svzip1q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
svint64_t svzip1q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
svint16_t svzip1q_s16(svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
svuint8_t svzip2q_u8(svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
svuint32_t svzip2q_u32(svuint32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
svuint64_t svzip2q_u64(svuint64_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
svuint16_t svzip2q_u16(svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
svint8_t svzip2q_s8(svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
svfloat64_t svzip2q_f64(svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
svfloat32_t svzip2q_f32(svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
svfloat16_t svzip2q_f16(svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
svint32_t svzip2q_s32(svint32_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
svint64_t svzip2q_s64(svint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
svint16_t svzip2q_s16(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
svuint8_t svld1ro(svbool_t, uint8_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
svuint32_t svld1ro(svbool_t, uint32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
svuint64_t svld1ro(svbool_t, uint64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
svuint16_t svld1ro(svbool_t, uint16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
svint8_t svld1ro(svbool_t, int8_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
svfloat64_t svld1ro(svbool_t, float64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
svfloat32_t svld1ro(svbool_t, float32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
svfloat16_t svld1ro(svbool_t, float16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
svint32_t svld1ro(svbool_t, int32_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
svint64_t svld1ro(svbool_t, int64_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
svint16_t svld1ro(svbool_t, int16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
svfloat64_t svmmla(svfloat64_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
svuint8_t svtrn1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
svuint32_t svtrn1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
svuint64_t svtrn1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
svuint16_t svtrn1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
svint8_t svtrn1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
svfloat64_t svtrn1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
svfloat32_t svtrn1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
svfloat16_t svtrn1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
svint32_t svtrn1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
svint64_t svtrn1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
svint16_t svtrn1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
svuint8_t svtrn2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
svuint32_t svtrn2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
svuint64_t svtrn2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
svuint16_t svtrn2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
svint8_t svtrn2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
svfloat64_t svtrn2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
svfloat32_t svtrn2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
svfloat16_t svtrn2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
svint32_t svtrn2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
svint64_t svtrn2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
svint16_t svtrn2q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
svuint8_t svuzp1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
svuint32_t svuzp1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
svuint64_t svuzp1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
svuint16_t svuzp1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
svint8_t svuzp1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
svfloat64_t svuzp1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
svfloat32_t svuzp1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
svfloat16_t svuzp1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
svint32_t svuzp1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
svint64_t svuzp1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
svint16_t svuzp1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
svuint8_t svuzp2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
svuint32_t svuzp2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
svuint64_t svuzp2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
svuint16_t svuzp2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
svint8_t svuzp2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
svfloat64_t svuzp2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
svfloat32_t svuzp2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
svfloat16_t svuzp2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
svint32_t svuzp2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
svint64_t svuzp2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
svint16_t svuzp2q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
svuint8_t svzip1q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
svuint32_t svzip1q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
svuint64_t svzip1q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
svuint16_t svzip1q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
svint8_t svzip1q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
svfloat64_t svzip1q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
svfloat32_t svzip1q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
svfloat16_t svzip1q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
svint32_t svzip1q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
svint64_t svzip1q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
svint16_t svzip1q(svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
svuint8_t svzip2q(svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
svuint32_t svzip2q(svuint32_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
svuint64_t svzip2q(svuint64_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
svuint16_t svzip2q(svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
svint8_t svzip2q(svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
svfloat64_t svzip2q(svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
svfloat32_t svzip2q(svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
svfloat16_t svzip2q(svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
svint32_t svzip2q(svint32_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
svint64_t svzip2q(svint64_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
svint16_t svzip2q(svint16_t, svint16_t);
#endif //defined(__ARM_FEATURE_SVE_MATMUL_FP64)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && defined(__ARM_FEATURE_SVE_BF16)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
svbfloat16_t svld1ro_bf16(svbool_t, bfloat16_t const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
svbfloat16_t svtrn1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
svbfloat16_t svtrn2q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
svbfloat16_t svuzp1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
svbfloat16_t svuzp2q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
svbfloat16_t svzip1q_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
svbfloat16_t svzip2q_bf16(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
svbfloat16_t svld1ro(svbool_t, bfloat16_t const *);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
svbfloat16_t svtrn1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
svbfloat16_t svtrn2q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
svbfloat16_t svuzp1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
svbfloat16_t svuzp2q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
svbfloat16_t svzip1q(svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
svbfloat16_t svzip2q(svbfloat16_t, svbfloat16_t);
#endif //defined(__ARM_FEATURE_SVE_MATMUL_FP64) && defined(__ARM_FEATURE_SVE_BF16)
#if defined(__ARM_FEATURE_SVE_MATMUL_INT8)
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
svint32_t svmmla_s32(svint32_t, svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
svuint32_t svmmla_u32(svuint32_t, svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
svint32_t svsudot_n_s32(svint32_t, svint8_t, uint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
svint32_t svsudot_s32(svint32_t, svint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
svint32_t svsudot_lane_s32(svint32_t, svint8_t, svuint8_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
svint32_t svusdot_n_s32(svint32_t, svuint8_t, int8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
svint32_t svusdot_s32(svint32_t, svuint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
svint32_t svusdot_lane_s32(svint32_t, svuint8_t, svint8_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
svint32_t svusmmla_s32(svint32_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
svint32_t svmmla(svint32_t, svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
svuint32_t svmmla(svuint32_t, svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
svint32_t svsudot(svint32_t, svint8_t, uint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
svint32_t svsudot(svint32_t, svint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
svint32_t svsudot_lane(svint32_t, svint8_t, svuint8_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
svint32_t svusdot(svint32_t, svuint8_t, int8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
svint32_t svusdot(svint32_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
svint32_t svusdot_lane(svint32_t, svuint8_t, svint8_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
svint32_t svusmmla(svint32_t, svuint8_t, svint8_t);
#endif //defined(__ARM_FEATURE_SVE_MATMUL_INT8)
#if defined(__ARM_FEATURE_SVE_BF16)
#define svcvtnt_bf16_x svcvtnt_bf16_m
#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m
#endif /*__ARM_FEATURE_SVE_BF16 */
#if defined(__ARM_FEATURE_SVE2)
#define svcvtnt_f16_x svcvtnt_f16_m
#define svcvtnt_f16_f32_x svcvtnt_f16_f32_m
#define svcvtnt_f32_x svcvtnt_f32_m
@@ -24032,8 +23884,6 @@ svint32_t svusmmla(svint32_t, svuint8_t, svint8_t);
#define svcvtxnt_f32_x svcvtxnt_f32_m
#define svcvtxnt_f32_f64_x svcvtxnt_f32_f64_m
#endif /*__ARM_FEATURE_SVE2 */
#ifdef __cplusplus
} // extern "C"
#endif
@@ -24042,6 +23892,4 @@ svint32_t svusmmla(svint32_t, svuint8_t, svint8_t);
#undef __aio
#endif /*__ARM_FEATURE_SVE */
#endif /* __ARM_SVE_H */
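A usage sketch for the SVE int8 matrix-multiply intrinsics declared above (the helper name is illustrative; it assumes a toolchain with SVE and the int8 matmul extension enabled, e.g. -march=armv8.2-a+sve+i8mm):

#include <arm_sve.h>

// Accumulate an int8 matrix multiply into 32-bit lanes: within each 128-bit
// segment, a 2x8 int8 block from `a` times an 8x2 int8 block from `b` is
// added into a 2x2 block of int32 accumulators.
svint32_t i8mm_step(svint32_t acc, svint8_t a, svint8_t b) {
  return svmmla_s32(acc, a, b);
}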
+18 -15

@@ -10,12 +10,14 @@
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
typedef unsigned short __bfloat16;
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
@@ -33,7 +35,7 @@ typedef unsigned short __bfloat16;
/// A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
@@ -74,9 +76,9 @@ _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
(__v32hi)__W);
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -96,9 +98,9 @@ _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
(__v32hi)_mm512_setzero_si512());
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
@@ -113,7 +115,7 @@ _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)_mm256_undefined_si256(),
(__v16bf)_mm256_undefined_si256(),
(__mmask16)-1);
}
@@ -134,7 +136,7 @@ _mm512_cvtneps_pbh(__m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)__W,
(__v16bf)__W,
(__mmask16)__U);
}
@@ -153,7 +155,7 @@ _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)_mm256_setzero_si256(),
(__v16bf)_mm256_setzero_si256(),
(__mmask16)__U);
}
@@ -174,8 +176,8 @@ _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
(__v16si) __A,
(__v16si) __B);
(__v32bf) __A,
(__v32bf) __B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -277,3 +279,4 @@ _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
#undef __DEFAULT_FN_ATTRS512
#endif
#endif
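A minimal usage sketch for the bf16 conversion and dot-product intrinsics above (helper name illustrative; assumes the translation unit is compiled with AVX512-BF16 enabled, e.g. -mavx512bf16):

#include <immintrin.h>

// Narrow two fp32 vectors to bf16, then accumulate the pairwise bf16
// products into the fp32 accumulator (32 products folded into 16 sums).
__m512 bf16_dot_accumulate(__m512 acc, __m512 a_hi, __m512 a_lo,
                           __m512 b_hi, __m512 b_lo) {
  __m512bh a = _mm512_cvtne2ps_pbh(a_hi, a_lo);
  __m512bh b = _mm512_cvtne2ps_pbh(b_hi, b_lo);
  return _mm512_dpbf16_ps(acc, a, b);
}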
+2 -2
@@ -256,8 +256,8 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}
#define _mm512_setzero _mm512_setzero_ps
+6 -9
@@ -10,6 +10,8 @@
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H
@@ -17,12 +19,6 @@
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
@@ -829,7 +825,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
struct __mm_load_sh_struct {
_Float16 __u;
} __attribute__((__packed__, __may_alias__));
_Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
_Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}
@@ -838,13 +834,13 @@ _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
__m128h src = (__v8hf)__builtin_shufflevector(
(__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
return (__m128h)__builtin_ia32_loadsh128_mask(
(__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
(const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
@@ -3347,3 +3343,4 @@ _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
#undef __DEFAULT_FN_ATTRS512
#endif
#endif
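A short usage sketch for the scalar fp16 load whose const-correctness is fixed above (helper name illustrative; assumes -mavx512fp16):

#include <immintrin.h>

// Load one fp16 scalar into the low lane of a vector; the remaining seven
// lanes are zeroed. The pointer only needs 16-bit alignment.
__m128h load_scalar_half(const _Float16 *p) {
  return _mm_load_sh(p);
}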
+13 -27
@@ -18,14 +18,21 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
#define _mm_madd52hi_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52hi_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
(__v2di) __Z);
}
#define _mm_madd52lo_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52lo_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
@@ -43,13 +50,6 @@ _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
@@ -66,13 +66,6 @@ _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
@@ -89,13 +82,6 @@ _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
+31 -38
@@ -10,11 +10,11 @@
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
@@ -59,9 +59,9 @@ _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
(__v8hi)__W);
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -81,9 +81,9 @@ _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
(__v8hi)_mm_setzero_si128());
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -123,9 +123,9 @@ _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
(__v16hi)__W);
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -145,9 +145,9 @@ _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
(__v16hi)_mm256_setzero_si256());
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
@@ -160,12 +160,8 @@ _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_pbh(__m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)_mm_undefined_si128(),
(__mmask8)-1);
}
#define _mm_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
@@ -185,7 +181,7 @@ _mm_cvtneps_pbh(__m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)__W,
(__v8bf)__W,
(__mmask8)__U);
}
@@ -205,7 +201,7 @@ _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)_mm_setzero_si128(),
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
@@ -218,12 +214,8 @@ _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_pbh(__m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)_mm_undefined_si128(),
(__mmask8)-1);
}
#define _mm256_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
@@ -242,7 +234,7 @@ _mm256_cvtneps_pbh(__m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)__W,
(__v8bf)__W,
(__mmask8)__U);
}
@@ -261,7 +253,7 @@ _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)_mm_setzero_si128(),
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
@@ -282,8 +274,8 @@ _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
(__v4si)__A,
(__v4si)__B);
(__v8bf)__A,
(__v8bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -351,8 +343,8 @@ _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
(__v8si)__A,
(__v8si)__B);
(__v16bf)__A,
(__v16bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -413,11 +405,11 @@ _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
/// A float data.
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
/// and fraction field is truncated to 7 bits.
static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
return (__bfloat16)__R[0];
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
return (__bf16)__R[0];
}
/// Convert Packed BF16 Data to Packed float Data.
@@ -520,3 +512,4 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS256
#endif
#endif
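A small sketch combining the scalar bf16 conversions above, which now use __bf16 instead of the deprecated __bfloat16 (helper name illustrative; assumes -mavx512bf16 -mavx512vl):

#include <immintrin.h>

// Narrow one float to bf16 (round to nearest even) and widen it back;
// the widening zero-extends the 7-bit fraction to 23 bits.
float round_trip_bf16(float x) {
  __bf16 b = _mm_cvtness_sbh(x);
  return _mm_cvtsbh_ss(b);
}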
+352
@@ -2803,6 +2803,358 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256()))
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi16(__m128i __W) {
return __builtin_reduce_add((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi16(__m128i __W) {
return __builtin_reduce_mul((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi16(__m128i __W) {
return __builtin_reduce_and((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi16(__m128i __W) {
return __builtin_reduce_or((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi16( __mmask8 __M, __m128i __W) {
__W = _mm_maskz_mov_epi16(__M, __W);
return __builtin_reduce_add((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi16( __mmask8 __M, __m128i __W) {
__W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
return __builtin_reduce_mul((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi16( __mmask8 __M, __m128i __W) {
__W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
return __builtin_reduce_and((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
__W = _mm_maskz_mov_epi16(__M, __W);
return __builtin_reduce_or((__v8hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi16(__m128i __V) {
return __builtin_reduce_max((__v8hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu16(__m128i __V) {
return __builtin_reduce_max((__v8hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi16(__m128i __V) {
return __builtin_reduce_min((__v8hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu16(__m128i __V) {
return __builtin_reduce_min((__v8hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
return __builtin_reduce_max((__v8hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
__V = _mm_maskz_mov_epi16(__M, __V);
return __builtin_reduce_max((__v8hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
return __builtin_reduce_min((__v8hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
return __builtin_reduce_min((__v8hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi16(__m256i __W) {
return __builtin_reduce_add((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi16(__m256i __W) {
return __builtin_reduce_mul((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi16(__m256i __W) {
return __builtin_reduce_and((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi16(__m256i __W) {
return __builtin_reduce_or((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi16(__M, __W);
return __builtin_reduce_add((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
return __builtin_reduce_mul((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
return __builtin_reduce_and((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi16(__M, __W);
return __builtin_reduce_or((__v16hi)__W);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi16(__m256i __V) {
return __builtin_reduce_max((__v16hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu16(__m256i __V) {
return __builtin_reduce_max((__v16hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi16(__m256i __V) {
return __builtin_reduce_min((__v16hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu16(__m256i __V) {
return __builtin_reduce_min((__v16hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
return __builtin_reduce_max((__v16hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
__V = _mm256_maskz_mov_epi16(__M, __V);
return __builtin_reduce_max((__v16hu)__V);
}
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
return __builtin_reduce_min((__v16hi)__V);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
return __builtin_reduce_min((__v16hu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi8(__m128i __W) {
return __builtin_reduce_add((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi8(__m128i __W) {
return __builtin_reduce_mul((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi8(__m128i __W) {
return __builtin_reduce_and((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi8(__m128i __W) {
return __builtin_reduce_or((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) {
__W = _mm_maskz_mov_epi8(__M, __W);
return __builtin_reduce_add((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) {
__W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
return __builtin_reduce_mul((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) {
__W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
return __builtin_reduce_and((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
__W = _mm_maskz_mov_epi8(__M, __W);
return __builtin_reduce_or((__v16qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi8(__m128i __V) {
return __builtin_reduce_max((__v16qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu8(__m128i __V) {
return __builtin_reduce_max((__v16qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi8(__m128i __V) {
return __builtin_reduce_min((__v16qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu8(__m128i __V) {
return __builtin_reduce_min((__v16qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
return __builtin_reduce_max((__v16qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
__V = _mm_maskz_mov_epi8(__M, __V);
return __builtin_reduce_max((__v16qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
return __builtin_reduce_min((__v16qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
return __builtin_reduce_min((__v16qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi8(__m256i __W) {
return __builtin_reduce_add((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi8(__m256i __W) {
return __builtin_reduce_mul((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi8(__m256i __W) {
return __builtin_reduce_and((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi8(__m256i __W) {
return __builtin_reduce_or((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi8(__M, __W);
return __builtin_reduce_add((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) {
__W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
return __builtin_reduce_mul((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) {
__W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
return __builtin_reduce_and((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi8(__M, __W);
return __builtin_reduce_or((__v32qs)__W);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi8(__m256i __V) {
return __builtin_reduce_max((__v32qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu8(__m256i __V) {
return __builtin_reduce_max((__v32qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi8(__m256i __V) {
return __builtin_reduce_min((__v32qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu8(__m256i __V) {
return __builtin_reduce_min((__v32qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
return __builtin_reduce_max((__v32qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
__V = _mm256_maskz_mov_epi8(__M, __V);
return __builtin_reduce_max((__v32qu)__V);
}
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
return __builtin_reduce_min((__v32qs)__V);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
return __builtin_reduce_min((__v32qu)__V);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
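A usage sketch for the new masked 16-bit reductions added above (helper name and mask value illustrative; assumes -mavx512bw -mavx512vl):

#include <immintrin.h>

// Horizontal sum of the even 16-bit lanes only: lanes cleared by the mask
// are zeroed before the reduction, so they do not contribute to the sum.
short sum_even_epi16_lanes(__m128i v) {
  return _mm_mask_reduce_add_epi16(0x55, v);
}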
+3
@@ -11,6 +11,8 @@
"Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H
@@ -2066,3 +2068,4 @@ _mm_reduce_min_ph(__m128h __V) {
#undef __DEFAULT_FN_ATTRS256
#endif
#endif
+177
@@ -0,0 +1,177 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(256)))
// These intrinsics must use VEX encoding (AVX-IFMA), unlike the EVEX-encoded AVX512-IFMA forms.
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
/// return __m128i dst.
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
/// return __m256i dst.
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
/// return __m128i dst.
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \return
/// return __m256i dst.
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXIFMAINTRIN_H
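A usage sketch for the VEX-encoded 52-bit multiply-add intrinsics above (helper name illustrative; assumes -mavxifma):

#include <immintrin.h>

// Per 64-bit lane, multiply the low 52 bits of a and b and accumulate the
// low and high 52-bit halves of the 104-bit product into separate
// accumulators, as in multi-limb big-integer arithmetic.
void madd52_step(__m128i *lo_acc, __m128i *hi_acc, __m128i a, __m128i b) {
  *lo_acc = _mm_madd52lo_avx_epu64(*lo_acc, a, b);
  *hi_acc = _mm_madd52hi_avx_epu64(*hi_acc, a, b);
}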
+12 -2
@@ -39,6 +39,16 @@ typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
@@ -4288,7 +4298,7 @@ _mm256_set1_epi64x(long long __q)
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
return __extension__ (__m256d){ 0, 0, 0, 0 };
return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}
/// Constructs a 256-bit floating-point vector of [8 x float] with all
@@ -4302,7 +4312,7 @@ _mm256_setzero_pd(void)
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}
/// Constructs a 256-bit integer vector initialized to zero.
+484
@@ -0,0 +1,484 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifdef __SSE2__
#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(256)))
/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}
/// Convert scalar BF16 (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}
/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}
/// Convert scalar half-precision (16-bit) floating-point element
/// stored at memory locations starting at location \a __A to a
/// single-precision (32-bit) floating-point, broadcast it to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}
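/* Illustrative usage sketch (editor's addition, not part of this header):
 * deinterleave 16 packed FP16 values into two float vectors using the
 * even/odd converters above. Assumes the translation unit is compiled with
 * the avxneconvert target feature enabled; the helper name is hypothetical. */
static __inline__ void
__deinterleave_ph_sketch(const __m256h *__src, __m256 *__even, __m256 *__odd) {
  *__even = _mm256_cvtneeph_ps(__src); /* elements 0, 2, 4, ... as floats */
  *__odd = _mm256_cvtneoph_ps(__src);  /* elements 1, 3, 5, ... as floats */
}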
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}
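/* Illustrative usage sketch (editor's addition, not part of this header):
 * narrow eight floats to BF16 and write the resulting 128 bits to memory.
 * memcpy is used rather than assuming a cast intrinsic for __m128bh; the
 * helper name is hypothetical and avxneconvert must be enabled. */
static __inline__ void
__store_bf16_sketch(void *__dst, __m256 __vals) {
  __m128bh __bh = _mm256_cvtneps_avx_pbh(__vals); /* 8 x float -> 8 x bf16 */
  __builtin_memcpy(__dst, &__bh, 16);             /* store all 128 bits */
}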
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__
+471
View File
@@ -0,0 +1,471 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
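/* Illustrative usage sketch (editor's addition, not part of this header):
 * accumulate a signed int8 dot product over __nvec pairs of 256-bit vectors,
 * then reduce the eight 32-bit lanes. Assumes avxvnniint8 plus AVX2/SSSE3
 * are enabled; the helper name is hypothetical. */
static __inline__ int
__dot_s8_sketch(const __m256i *__a, const __m256i *__b, int __nvec) {
  __m256i __acc = _mm256_setzero_si256();
  for (int __i = 0; __i < __nvec; ++__i)
    __acc = _mm256_dpbssd_epi32(__acc, __a[__i], __b[__i]);
  __m128i __sum = _mm_add_epi32(_mm256_castsi256_si128(__acc),
                                _mm256_extracti128_si256(__acc, 1));
  __sum = _mm_hadd_epi32(__sum, __sum); /* 4 -> 2 partial sums */
  __sum = _mm_hadd_epi32(__sum, __sum); /* 2 -> 1 */
  return _mm_cvtsi128_si32(__sum);
}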
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x unsigned char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x unsigned char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x unsigned char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x unsigned char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
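/* Illustrative usage sketch (editor's addition, not part of this header):
 * the unsigned saturating form keeps the four 32-bit accumulators from
 * wrapping when many u8*u8 products are summed. Assumes avxvnniint8 is
 * enabled; the helper name is hypothetical. */
static __inline__ __m128i
__dot_u8_sat_sketch(const __m128i *__a, const __m128i *__b, int __nvec) {
  __m128i __acc = _mm_setzero_si128();
  for (int __i = 0; __i < __nvec; ++__i)
    __acc = _mm_dpbuuds_epi32(__acc, __a[__i], __b[__i]);
  return __acc; /* four per-lane saturated accumulators */
}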
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINT8INTRIN_H
+70
View File
@@ -0,0 +1,70 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error \
"Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__
typedef enum {
_CMPCCX_O, /* Overflow. */
_CMPCCX_NO, /* No overflow. */
_CMPCCX_B, /* Below. */
_CMPCCX_NB, /* Not below. */
_CMPCCX_Z, /* Zero. */
_CMPCCX_NZ, /* Not zero. */
_CMPCCX_BE, /* Below or equal. */
_CMPCCX_NBE, /* Neither below nor equal. */
_CMPCCX_S, /* Sign. */
_CMPCCX_NS, /* No sign. */
_CMPCCX_P, /* Parity. */
_CMPCCX_NP, /* No parity. */
_CMPCCX_L, /* Less. */
_CMPCCX_NL, /* Not less. */
_CMPCCX_LE, /* Less or equal. */
_CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;
/// Compares the value at the memory location __A with the value of __B. If
/// the specified condition __D is met, adds the third operand __C to the
/// value at __A and writes the result back to __A; otherwise the value at
/// __A is unchanged. The return value is the original value at __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
/// A pointer specifying the memory address of the value to compare and update.
///
/// \param __B
/// An integer operand to compare against the value at __A.
///
/// \param __C
/// An integer operand to add to the value at __A when the condition is met.
///
/// \param __D
/// The specified condition.
///
/// \returns An integer which is the original value at the memory location __A.
#define _cmpccxadd_epi32(__A, __B, __C, __D) \
((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C), \
(int)(__D))))
#define _cmpccxadd_epi64(__A, __B, __C, __D) \
((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
(long long)(__C), (int)(__D))))
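/* Illustrative usage sketch (editor's addition, not part of this header):
 * following the description above, increment a shared counter only while it
 * is still below a limit, returning the value observed before the attempt.
 * Assumes the cmpccxadd feature is enabled; the helper name is hypothetical. */
static __inline__ int
__bounded_inc_sketch(int *__counter, int __limit) {
  /* If *__counter < __limit (condition _CMPCCX_L), add 1 to *__counter. */
  return _cmpccxadd_epi32(__counter, __limit, 1, _CMPCCX_L);
}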
#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H
+11 -1
View File
@@ -200,9 +200,18 @@
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_RAOINT 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_CMPCCXADD 0x00000080
#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
#define bit_AVXIFMA 0x00800000
/* Features in %edx for leaf 7 sub-leaf 1 */
#define bit_AVXVNNIINT8 0x00000010
#define bit_AVXNECONVERT 0x00000020
#define bit_PREFETCHI 0x00004000
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
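/* Illustrative usage sketch (editor's addition, not part of this header):
 * client code can probe the new leaf 7 sub-leaf 1 bits at run time with the
 * __get_cpuid_count helper this header provides. The function name is
 * hypothetical. */
static __inline int __has_avxneconvert(void) {
  unsigned int __eax, __ebx, __ecx, __edx;
  if (!__get_cpuid_count(7, 1, &__eax, &__ebx, &__ecx, &__edx))
    return 0;
  return (__edx & bit_AVXNECONVERT) != 0;
}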
@@ -261,7 +270,8 @@
: "0"(__leaf), "2"(__count))
#endif
static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
+90
View File
@@ -0,0 +1,90 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH
#include_next <cmath>
#if defined(_LIBCPP_STD_VER)
// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.
__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);
namespace std {
// For __constexpr_fmin/fmax we only need device-side overloads before c++14
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
return __builtin_fmaxf(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
return __builtin_fmax(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
return __builtin_fmaxl(__x, __y);
}
template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
using __result_type = typename __promote<_Tp, _Up>::type;
return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14
// For the logb/scalbn templates we must always provide device overloads
// because the libc++ implementation uses __builtin_XXX, which gets translated
// into a libcall that we can't handle on the GPU. We forward those to the
// CUDA-provided implementations.
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
return ::logb(__x);
}
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
return ::scalbn(__x, __exp);
}
} // namespace std
#endif // _LIBCPP_STD_VER
#endif // include guard
+11 -1
View File
@@ -38,6 +38,16 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
* appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
@@ -1809,7 +1819,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
/// all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
return __extension__(__m128d){0, 0};
return __extension__(__m128d){0.0, 0.0};
}
/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+17 -10
View File
@@ -38,9 +38,10 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@@ -67,9 +68,10 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201703L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@@ -84,7 +86,10 @@
/* Characteristics of floating point types, C99 5.2.4.2.2 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(defined(__cplusplus) && __cplusplus >= 201103L)
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
#endif
#define FLT_ROUNDS (__builtin_flt_rounds())
#define FLT_RADIX __FLT_RADIX__
@@ -92,8 +97,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@@ -130,8 +136,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201703L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
+7 -5
View File
@@ -20,10 +20,12 @@
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for VLX forms. */
/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
@@ -99,7 +101,7 @@ _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
@@ -107,7 +109,7 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
-15
View File
@@ -1,15 +0,0 @@
//===----- hlsl.h - HLSL definitions --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_H_
#define _HLSL_H_
#include "hlsl/hlsl_basic_types.h"
#include "hlsl/hlsl_intrinsics.h"
#endif //_HLSL_H_
-64
View File
@@ -1,64 +0,0 @@
//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_HLSL_BASIC_TYPES_H_
#define _HLSL_HLSL_BASIC_TYPES_H_
// built-in scalar data types:
#ifdef __HLSL_ENABLE_16_BIT
// 16-bit integer.
typedef unsigned short uint16_t;
typedef short int16_t;
#endif
// unsigned 32-bit integer.
typedef unsigned int uint;
// 64-bit integer.
typedef unsigned long uint64_t;
typedef long int64_t;
// built-in vector data types:
#ifdef __HLSL_ENABLE_16_BIT
typedef vector<int16_t, 2> int16_t2;
typedef vector<int16_t, 3> int16_t3;
typedef vector<int16_t, 4> int16_t4;
typedef vector<uint16_t, 2> uint16_t2;
typedef vector<uint16_t, 3> uint16_t3;
typedef vector<uint16_t, 4> uint16_t4;
#endif
typedef vector<int, 2> int2;
typedef vector<int, 3> int3;
typedef vector<int, 4> int4;
typedef vector<uint, 2> uint2;
typedef vector<uint, 3> uint3;
typedef vector<uint, 4> uint4;
typedef vector<int64_t, 2> int64_t2;
typedef vector<int64_t, 3> int64_t3;
typedef vector<int64_t, 4> int64_t4;
typedef vector<uint64_t, 2> uint64_t2;
typedef vector<uint64_t, 3> uint64_t3;
typedef vector<uint64_t, 4> uint64_t4;
#ifdef __HLSL_ENABLE_16_BIT
typedef vector<half, 2> half2;
typedef vector<half, 3> half3;
typedef vector<half, 4> half4;
#endif
typedef vector<float, 2> float2;
typedef vector<float, 3> float3;
typedef vector<float, 4> float4;
typedef vector<double, 2> double2;
typedef vector<double, 3> double3;
typedef vector<double, 4> double4;
#endif //_HLSL_HLSL_BASIC_TYPES_H_
-15
View File
@@ -1,15 +0,0 @@
//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_HLSL_INTRINSICS_H_
#define _HLSL_HLSL_INTRINSICS_H_
__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
WaveActiveCountBits(bool bBit);
#endif //_HLSL_HLSL_INTRINSICS_H_
+40 -8
View File
@@ -189,6 +189,11 @@
#include <avx512ifmavlintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVXIFMA__)
#include <avxifmaintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
@@ -214,17 +219,13 @@
#include <avx512pfintrin.h>
#endif
/*
* FIXME: _Float16 type is legal only when HW support float16 operation.
* We use __AVX512FP16__ to identify if float16 is supported or not, so
* when float16 is not supported, the related header is not included.
*
*/
#if defined(__AVX512FP16__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
#if defined(__AVX512FP16__) && defined(__AVX512VL__)
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512FP16__))
#include <avx512vlfp16intrin.h>
#endif
@@ -258,6 +259,16 @@
#include <gfniintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVXVNNIINT8__)
#include <avxvnniint8intrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVXNECONVERT__)
#include <avxneconvertintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
@@ -291,6 +302,23 @@ _rdrand64_step(unsigned long long *__p)
{
return (int)__builtin_ia32_rdrand64_step(__p);
}
#else
// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
// rdrand instructions.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
unsigned int __lo, __hi;
unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
if (__res_lo && __res_hi) {
*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
return 1;
} else {
*__p = 0;
return 0;
}
}
#endif
#endif /* __RDRND__ */
@@ -495,6 +523,10 @@ _storebe_i64(void * __P, long long __D) {
defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AMXFP16__)
#include <amxfp16intrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__KL__) || defined(__WIDEKL__)
+234
View File
@@ -0,0 +1,234 @@
/*===------------ larchintrin.h - LoongArch intrinsics ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef _LOONGARCH_BASE_INTRIN_H
#define _LOONGARCH_BASE_INTRIN_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct rdtime {
unsigned int value;
unsigned int timeid;
} __rdtime_t;
#if __loongarch_grlen == 64
typedef struct drdtime {
unsigned long dvalue;
unsigned long dtimeid;
} __drdtime_t;
extern __inline __drdtime_t
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtime_d(void) {
__drdtime_t __drdtime;
__asm__ volatile(
"rdtime.d %[val], %[tid]\n\t"
: [val] "=&r"(__drdtime.dvalue), [tid] "=&r"(__drdtime.dtimeid));
return __drdtime;
}
#endif
extern __inline __rdtime_t
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtimeh_w(void) {
__rdtime_t __rdtime;
__asm__ volatile("rdtimeh.w %[val], %[tid]\n\t"
: [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
return __rdtime;
}
extern __inline __rdtime_t
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtimel_w(void) {
__rdtime_t __rdtime;
__asm__ volatile("rdtimel.w %[val], %[tid]\n\t"
: [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
return __rdtime;
}
#if __loongarch_grlen == 64
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc_w_b_w(char _1, int _2) {
return (int)__builtin_loongarch_crc_w_b_w((char)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc_w_h_w(short _1, int _2) {
return (int)__builtin_loongarch_crc_w_h_w((short)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc_w_w_w(int _1, int _2) {
return (int)__builtin_loongarch_crc_w_w_w((int)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc_w_d_w(long int _1, int _2) {
return (int)__builtin_loongarch_crc_w_d_w((long int)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crcc_w_b_w(char _1, int _2) {
return (int)__builtin_loongarch_crcc_w_b_w((char)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crcc_w_h_w(short _1, int _2) {
return (int)__builtin_loongarch_crcc_w_h_w((short)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crcc_w_w_w(int _1, int _2) {
return (int)__builtin_loongarch_crcc_w_w_w((int)_1, (int)_2);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crcc_w_d_w(long int _1, int _2) {
return (int)__builtin_loongarch_crcc_w_d_w((long int)_1, (int)_2);
}
#endif
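#if __loongarch_grlen == 64
/* Illustrative usage sketch (editor's addition, not part of this header):
 * fold a word-aligned buffer through the CRC intrinsics above, assuming an
 * operand order of (data word, accumulated checksum) and the conventional
 * all-ones seed. The helper name is hypothetical. */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32_words_sketch(const int *__buf, int __n) {
  int __crc = -1;
  for (int __i = 0; __i < __n; ++__i)
    __crc = __crc_w_w_w(__buf[__i], __crc);
  return __crc;
}
#endif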
#define __break(/*ui15*/ _1) __builtin_loongarch_break((_1))
#if __loongarch_grlen == 32
#define __cacop_w(/*uimm5*/ _1, /*unsigned int*/ _2, /*simm12*/ _3) \
((void)__builtin_loongarch_cacop_w((_1), (unsigned int)(_2), (_3)))
#endif
#if __loongarch_grlen == 64
#define __cacop_d(/*uimm5*/ _1, /*unsigned long int*/ _2, /*simm12*/ _3) \
((void)__builtin_loongarch_cacop_d((_1), (unsigned long int)(_2), (_3)))
#endif
#define __dbar(/*ui15*/ _1) __builtin_loongarch_dbar((_1))
#define __ibar(/*ui15*/ _1) __builtin_loongarch_ibar((_1))
#define __movfcsr2gr(/*ui5*/ _1) __builtin_loongarch_movfcsr2gr((_1));
#define __movgr2fcsr(/*ui5*/ _1, _2) \
__builtin_loongarch_movgr2fcsr((_1), (unsigned int)_2);
#define __syscall(/*ui15*/ _1) __builtin_loongarch_syscall((_1))
#define __csrrd_w(/*ui14*/ _1) ((unsigned int)__builtin_loongarch_csrrd_w((_1)))
#define __csrwr_w(/*unsigned int*/ _1, /*ui14*/ _2) \
((unsigned int)__builtin_loongarch_csrwr_w((unsigned int)(_1), (_2)))
#define __csrxchg_w(/*unsigned int*/ _1, /*unsigned int*/ _2, /*ui14*/ _3) \
((unsigned int)__builtin_loongarch_csrxchg_w((unsigned int)(_1), \
(unsigned int)(_2), (_3)))
#if __loongarch_grlen == 64
#define __csrrd_d(/*ui14*/ _1) \
((unsigned long int)__builtin_loongarch_csrrd_d((_1)))
#define __csrwr_d(/*unsigned long int*/ _1, /*ui14*/ _2) \
((unsigned long int)__builtin_loongarch_csrwr_d((unsigned long int)(_1), \
(_2)))
#define __csrxchg_d(/*unsigned long int*/ _1, /*unsigned long int*/ _2, \
/*ui14*/ _3) \
((unsigned long int)__builtin_loongarch_csrxchg_d( \
(unsigned long int)(_1), (unsigned long int)(_2), (_3)))
#endif
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrrd_b(unsigned int _1) {
return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
}
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrrd_h(unsigned int _1) {
return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrrd_w(unsigned int _1) {
return (unsigned int)__builtin_loongarch_iocsrrd_w((unsigned int)_1);
}
#if __loongarch_grlen == 64
extern __inline unsigned long int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrrd_d(unsigned int _1) {
return (unsigned long int)__builtin_loongarch_iocsrrd_d((unsigned int)_1);
}
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrwr_b(unsigned char _1, unsigned int _2) {
__builtin_loongarch_iocsrwr_b((unsigned char)_1, (unsigned int)_2);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrwr_h(unsigned short _1, unsigned int _2) {
__builtin_loongarch_iocsrwr_h((unsigned short)_1, (unsigned int)_2);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrwr_w(unsigned int _1, unsigned int _2) {
__builtin_loongarch_iocsrwr_w((unsigned int)_1, (unsigned int)_2);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__cpucfg(unsigned int _1) {
return (unsigned int)__builtin_loongarch_cpucfg((unsigned int)_1);
}
#if __loongarch_grlen == 64
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__iocsrwr_d(unsigned long int _1, unsigned int _2) {
__builtin_loongarch_iocsrwr_d((unsigned long int)_1, (unsigned int)_2);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__asrtgt_d(long int _1, long int _2) {
__builtin_loongarch_asrtgt_d((long int)_1, (long int)_2);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__asrtle_d(long int _1, long int _2) {
__builtin_loongarch_asrtle_d((long int)_1, (long int)_2);
}
#endif
#if __loongarch_grlen == 64
#define __lddir_d(/*long int*/ _1, /*ui5*/ _2) \
((long int)__builtin_loongarch_lddir_d((long int)(_1), (_2)))
#define __ldpte_d(/*long int*/ _1, /*ui5*/ _2) \
((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
#endif
#ifdef __cplusplus
}
#endif
#endif /* _LOONGARCH_BASE_INTRIN_H */
+3 -2
View File
@@ -65,7 +65,7 @@
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
@@ -93,7 +93,8 @@
/* C99 5.2.4.2.1: Added long long.
C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
*/
#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(defined(__cplusplus) && __cplusplus >= 201103L)
#undef LLONG_MIN
#undef LLONG_MAX
+19
View File
@@ -74,6 +74,25 @@
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)
// Undefine any feature macros that have been explicitly disabled using
// an __undef_<feature> macro.
#ifdef __undef___opencl_c_work_group_collective_functions
#undef __opencl_c_work_group_collective_functions
#endif
#ifdef __undef___opencl_c_atomic_order_seq_cst
#undef __opencl_c_atomic_order_seq_cst
#endif
#ifdef __undef___opencl_c_atomic_scope_device
#undef __opencl_c_atomic_scope_device
#endif
#ifdef __undef___opencl_c_atomic_scope_all_devices
#undef __opencl_c_atomic_scope_all_devices
#endif
#ifdef __undef___opencl_c_read_write_images
#undef __opencl_c_read_write_images
#endif
#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
#if !defined(__opencl_c_generic_address_space)
+175 -145
View File
@@ -12396,11 +12396,11 @@ void __ovld vstorea_half16_rtn(double16, size_t, __private half *);
* image objects and then want to read the updated data.
*/
void __ovld __conv barrier(cl_mem_fence_flags flags);
void __ovld __conv barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv work_group_barrier(cl_mem_fence_flags, memory_scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@@ -12418,7 +12418,7 @@ void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld mem_fence(cl_mem_fence_flags flags);
void __ovld mem_fence(cl_mem_fence_flags);
/**
* Read memory barrier that orders only
@@ -12430,7 +12430,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld read_mem_fence(cl_mem_fence_flags flags);
void __ovld read_mem_fence(cl_mem_fence_flags);
/**
* Write memory barrier that orders only
@@ -12442,7 +12442,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld write_mem_fence(cl_mem_fence_flags flags);
void __ovld write_mem_fence(cl_mem_fence_flags);
// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
@@ -12891,29 +12891,29 @@ void __ovld prefetch(const __global half16 *, size_t);
* (old + val) and store result at location
* pointed by p. The function returns old.
*/
int __ovld atomic_add(volatile __global int *p, int val);
uint __ovld atomic_add(volatile __global uint *p, uint val);
int __ovld atomic_add(volatile __local int *p, int val);
uint __ovld atomic_add(volatile __local uint *p, uint val);
int __ovld atomic_add(volatile __global int *, int);
uint __ovld atomic_add(volatile __global uint *, uint);
int __ovld atomic_add(volatile __local int *, int);
uint __ovld atomic_add(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_add(volatile int *p, int val);
uint __ovld atomic_add(volatile uint *p, uint val);
int __ovld atomic_add(volatile int *, int);
uint __ovld atomic_add(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_add(volatile __global int *p, int val);
uint __ovld atom_add(volatile __global uint *p, uint val);
int __ovld atom_add(volatile __global int *, int);
uint __ovld atom_add(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_add(volatile __local int *p, int val);
uint __ovld atom_add(volatile __local uint *p, uint val);
int __ovld atom_add(volatile __local int *, int);
uint __ovld atom_add(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_add(volatile __global long *p, long val);
ulong __ovld atom_add(volatile __global ulong *p, ulong val);
long __ovld atom_add(volatile __local long *p, long val);
ulong __ovld atom_add(volatile __local ulong *p, ulong val);
long __ovld atom_add(volatile __global long *, long);
ulong __ovld atom_add(volatile __global ulong *, ulong);
long __ovld atom_add(volatile __local long *, long);
ulong __ovld atom_add(volatile __local ulong *, ulong);
#endif
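/* Illustrative usage sketch (editor's addition, not part of this header): an
 * OpenCL C kernel that counts matching elements through the legacy atomic_add
 * overloads declared above. The kernel name is hypothetical. */
__kernel void count_matches_sketch(__global const int *in,
                                   __global int *counter, int needle) {
  if (in[get_global_id(0)] == needle)
    atomic_add(counter, 1);
}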
/**
@@ -12921,29 +12921,29 @@ ulong __ovld atom_add(volatile __local ulong *p, ulong val);
* Compute (old - val) and store result at location pointed by p. The function
* returns old.
*/
int __ovld atomic_sub(volatile __global int *p, int val);
uint __ovld atomic_sub(volatile __global uint *p, uint val);
int __ovld atomic_sub(volatile __local int *p, int val);
uint __ovld atomic_sub(volatile __local uint *p, uint val);
int __ovld atomic_sub(volatile __global int *, int);
uint __ovld atomic_sub(volatile __global uint *, uint);
int __ovld atomic_sub(volatile __local int *, int);
uint __ovld atomic_sub(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_sub(volatile int *p, int val);
uint __ovld atomic_sub(volatile uint *p, uint val);
int __ovld atomic_sub(volatile int *, int);
uint __ovld atomic_sub(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_sub(volatile __global int *p, int val);
uint __ovld atom_sub(volatile __global uint *p, uint val);
int __ovld atom_sub(volatile __global int *, int);
uint __ovld atom_sub(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_sub(volatile __local int *p, int val);
uint __ovld atom_sub(volatile __local uint *p, uint val);
int __ovld atom_sub(volatile __local int *, int);
uint __ovld atom_sub(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_sub(volatile __global long *p, long val);
ulong __ovld atom_sub(volatile __global ulong *p, ulong val);
long __ovld atom_sub(volatile __local long *p, long val);
ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
long __ovld atom_sub(volatile __global long *, long);
ulong __ovld atom_sub(volatile __global ulong *, ulong);
long __ovld atom_sub(volatile __local long *, long);
ulong __ovld atom_sub(volatile __local ulong *, ulong);
#endif
/**
@@ -12951,32 +12951,32 @@ ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
* with new value given by val. Returns old
* value.
*/
int __ovld atomic_xchg(volatile __global int *p, int val);
uint __ovld atomic_xchg(volatile __global uint *p, uint val);
int __ovld atomic_xchg(volatile __local int *p, int val);
uint __ovld atomic_xchg(volatile __local uint *p, uint val);
float __ovld atomic_xchg(volatile __global float *p, float val);
float __ovld atomic_xchg(volatile __local float *p, float val);
int __ovld atomic_xchg(volatile __global int *, int);
uint __ovld atomic_xchg(volatile __global uint *, uint);
int __ovld atomic_xchg(volatile __local int *, int);
uint __ovld atomic_xchg(volatile __local uint *, uint);
float __ovld atomic_xchg(volatile __global float *, float);
float __ovld atomic_xchg(volatile __local float *, float);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xchg(volatile int *p, int val);
uint __ovld atomic_xchg(volatile uint *p, uint val);
float __ovld atomic_xchg(volatile float *p, float val);
int __ovld atomic_xchg(volatile int *, int);
uint __ovld atomic_xchg(volatile uint *, uint);
float __ovld atomic_xchg(volatile float *, float);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
uint __ovld atom_xchg(volatile __global uint *p, uint val);
int __ovld atom_xchg(volatile __global int *, int);
uint __ovld atom_xchg(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_xchg(volatile __local int *p, int val);
uint __ovld atom_xchg(volatile __local uint *p, uint val);
int __ovld atom_xchg(volatile __local int *, int);
uint __ovld atom_xchg(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_xchg(volatile __global long *p, long val);
long __ovld atom_xchg(volatile __local long *p, long val);
ulong __ovld atom_xchg(volatile __global ulong *p, ulong val);
ulong __ovld atom_xchg(volatile __local ulong *p, ulong val);
long __ovld atom_xchg(volatile __global long *, long);
long __ovld atom_xchg(volatile __local long *, long);
ulong __ovld atom_xchg(volatile __global ulong *, ulong);
ulong __ovld atom_xchg(volatile __local ulong *, ulong);
#endif
/**
@@ -13048,29 +13048,29 @@ ulong __ovld atom_dec(volatile __local ulong *);
* location pointed by p. The function
* returns old.
*/
int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile __global int *, int, int);
uint __ovld atomic_cmpxchg(volatile __global uint *, uint, uint);
int __ovld atomic_cmpxchg(volatile __local int *, int, int);
uint __ovld atomic_cmpxchg(volatile __local uint *, uint, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
uint __ovld atomic_cmpxchg(volatile uint *p, uint cmp, uint val);
int __ovld atomic_cmpxchg(volatile int *, int, int);
uint __ovld atomic_cmpxchg(volatile uint *, uint, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __global int *, int, int);
uint __ovld atom_cmpxchg(volatile __global uint *, uint, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
uint __ovld atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
int __ovld atom_cmpxchg(volatile __local int *, int, int);
uint __ovld atom_cmpxchg(volatile __local uint *, uint, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
long __ovld atom_cmpxchg(volatile __global long *, long, long);
ulong __ovld atom_cmpxchg(volatile __global ulong *, ulong, ulong);
long __ovld atom_cmpxchg(volatile __local long *, long, long);
ulong __ovld atom_cmpxchg(volatile __local ulong *, ulong, ulong);
#endif
/**
@@ -13080,29 +13080,29 @@ ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
* location pointed by p. The function
* returns old.
*/
int __ovld atomic_min(volatile __global int *p, int val);
uint __ovld atomic_min(volatile __global uint *p, uint val);
int __ovld atomic_min(volatile __local int *p, int val);
uint __ovld atomic_min(volatile __local uint *p, uint val);
int __ovld atomic_min(volatile __global int *, int);
uint __ovld atomic_min(volatile __global uint *, uint);
int __ovld atomic_min(volatile __local int *, int);
uint __ovld atomic_min(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_min(volatile int *p, int val);
uint __ovld atomic_min(volatile uint *p, uint val);
int __ovld atomic_min(volatile int *, int);
uint __ovld atomic_min(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_min(volatile __global int *p, int val);
uint __ovld atom_min(volatile __global uint *p, uint val);
int __ovld atom_min(volatile __global int *, int);
uint __ovld atom_min(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_min(volatile __local int *p, int val);
uint __ovld atom_min(volatile __local uint *p, uint val);
int __ovld atom_min(volatile __local int *, int);
uint __ovld atom_min(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_min(volatile __global long *p, long val);
ulong __ovld atom_min(volatile __global ulong *p, ulong val);
long __ovld atom_min(volatile __local long *p, long val);
ulong __ovld atom_min(volatile __local ulong *p, ulong val);
long __ovld atom_min(volatile __global long *, long);
ulong __ovld atom_min(volatile __global ulong *, ulong);
long __ovld atom_min(volatile __local long *, long);
ulong __ovld atom_min(volatile __local ulong *, ulong);
#endif
/**
@@ -13112,29 +13112,29 @@ ulong __ovld atom_min(volatile __local ulong *p, ulong val);
* location pointed by p. The function
* returns old.
*/
int __ovld atomic_max(volatile __global int *p, int val);
uint __ovld atomic_max(volatile __global uint *p, uint val);
int __ovld atomic_max(volatile __local int *p, int val);
uint __ovld atomic_max(volatile __local uint *p, uint val);
int __ovld atomic_max(volatile __global int *, int);
uint __ovld atomic_max(volatile __global uint *, uint);
int __ovld atomic_max(volatile __local int *, int);
uint __ovld atomic_max(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_max(volatile int *p, int val);
uint __ovld atomic_max(volatile uint *p, uint val);
int __ovld atomic_max(volatile int *, int);
uint __ovld atomic_max(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_max(volatile __global int *p, int val);
uint __ovld atom_max(volatile __global uint *p, uint val);
int __ovld atom_max(volatile __global int *, int);
uint __ovld atom_max(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_max(volatile __local int *p, int val);
uint __ovld atom_max(volatile __local uint *p, uint val);
int __ovld atom_max(volatile __local int *, int);
uint __ovld atom_max(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_max(volatile __global long *p, long val);
ulong __ovld atom_max(volatile __global ulong *p, ulong val);
long __ovld atom_max(volatile __local long *p, long val);
ulong __ovld atom_max(volatile __local ulong *p, ulong val);
long __ovld atom_max(volatile __global long *, long);
ulong __ovld atom_max(volatile __global ulong *, ulong);
long __ovld atom_max(volatile __local long *, long);
ulong __ovld atom_max(volatile __local ulong *, ulong);
#endif
/**
@@ -13143,29 +13143,29 @@ ulong __ovld atom_max(volatile __local ulong *p, ulong val);
* (old & val) and store result at location
* pointed by p. The function returns old.
*/
int __ovld atomic_and(volatile __global int *p, int val);
uint __ovld atomic_and(volatile __global uint *p, uint val);
int __ovld atomic_and(volatile __local int *p, int val);
uint __ovld atomic_and(volatile __local uint *p, uint val);
int __ovld atomic_and(volatile __global int *, int);
uint __ovld atomic_and(volatile __global uint *, uint);
int __ovld atomic_and(volatile __local int *, int);
uint __ovld atomic_and(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_and(volatile int *p, int val);
uint __ovld atomic_and(volatile uint *p, uint val);
int __ovld atomic_and(volatile int *, int);
uint __ovld atomic_and(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_and(volatile __global int *p, int val);
uint __ovld atom_and(volatile __global uint *p, uint val);
int __ovld atom_and(volatile __global int *, int);
uint __ovld atom_and(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_and(volatile __local int *p, int val);
uint __ovld atom_and(volatile __local uint *p, uint val);
int __ovld atom_and(volatile __local int *, int);
uint __ovld atom_and(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_and(volatile __global long *p, long val);
ulong __ovld atom_and(volatile __global ulong *p, ulong val);
long __ovld atom_and(volatile __local long *p, long val);
ulong __ovld atom_and(volatile __local ulong *p, ulong val);
long __ovld atom_and(volatile __global long *, long);
ulong __ovld atom_and(volatile __global ulong *, ulong);
long __ovld atom_and(volatile __local long *, long);
ulong __ovld atom_and(volatile __local ulong *, ulong);
#endif
/**
@@ -13174,29 +13174,29 @@ ulong __ovld atom_and(volatile __local ulong *p, ulong val);
* (old | val) and store result at location
* pointed by p. The function returns old.
*/
int __ovld atomic_or(volatile __global int *p, int val);
uint __ovld atomic_or(volatile __global uint *p, uint val);
int __ovld atomic_or(volatile __local int *p, int val);
uint __ovld atomic_or(volatile __local uint *p, uint val);
int __ovld atomic_or(volatile __global int *, int);
uint __ovld atomic_or(volatile __global uint *, uint);
int __ovld atomic_or(volatile __local int *, int);
uint __ovld atomic_or(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_or(volatile int *p, int val);
uint __ovld atomic_or(volatile uint *p, uint val);
int __ovld atomic_or(volatile int *, int);
uint __ovld atomic_or(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_or(volatile __global int *p, int val);
uint __ovld atom_or(volatile __global uint *p, uint val);
int __ovld atom_or(volatile __global int *, int);
uint __ovld atom_or(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_or(volatile __local int *p, int val);
uint __ovld atom_or(volatile __local uint *p, uint val);
int __ovld atom_or(volatile __local int *, int);
uint __ovld atom_or(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_or(volatile __global long *p, long val);
ulong __ovld atom_or(volatile __global ulong *p, ulong val);
long __ovld atom_or(volatile __local long *p, long val);
ulong __ovld atom_or(volatile __local ulong *p, ulong val);
long __ovld atom_or(volatile __global long *, long);
ulong __ovld atom_or(volatile __global ulong *, ulong);
long __ovld atom_or(volatile __local long *, long);
ulong __ovld atom_or(volatile __local ulong *, ulong);
#endif
/**
@@ -13205,29 +13205,29 @@ ulong __ovld atom_or(volatile __local ulong *p, ulong val);
* (old ^ val) and store result at location
* pointed by p. The function returns old.
*/
int __ovld atomic_xor(volatile __global int *p, int val);
uint __ovld atomic_xor(volatile __global uint *p, uint val);
int __ovld atomic_xor(volatile __local int *p, int val);
uint __ovld atomic_xor(volatile __local uint *p, uint val);
int __ovld atomic_xor(volatile __global int *, int);
uint __ovld atomic_xor(volatile __global uint *, uint);
int __ovld atomic_xor(volatile __local int *, int);
uint __ovld atomic_xor(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xor(volatile int *p, int val);
uint __ovld atomic_xor(volatile uint *p, uint val);
int __ovld atomic_xor(volatile int *, int);
uint __ovld atomic_xor(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_xor(volatile __global int *p, int val);
uint __ovld atom_xor(volatile __global uint *p, uint val);
int __ovld atom_xor(volatile __global int *, int);
uint __ovld atom_xor(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_xor(volatile __local int *p, int val);
uint __ovld atom_xor(volatile __local uint *p, uint val);
int __ovld atom_xor(volatile __local int *, int);
uint __ovld atom_xor(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_xor(volatile __global long *p, long val);
ulong __ovld atom_xor(volatile __global ulong *p, ulong val);
long __ovld atom_xor(volatile __local long *p, long val);
ulong __ovld atom_xor(volatile __local ulong *p, ulong val);
long __ovld atom_xor(volatile __global long *, long);
ulong __ovld atom_xor(volatile __global ulong *, ulong);
long __ovld atom_xor(volatile __local long *, long);
ulong __ovld atom_xor(volatile __local ulong *, ulong);
#endif
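A minimal OpenCL C sketch (not from the header itself) of how the legacy atom_* declarations above are typically used; the kernel and buffer names are hypothetical, and the device is assumed to expose both int32 atomics extensions.

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
__kernel void running_stats(__global int *stats, __global const int *vals) {
  size_t i = get_global_id(0);
  atom_max(&stats[0], vals[i]);        /* running maximum; returns the old value */
  atom_cmpxchg(&stats[1], 0, vals[i]); /* whichever work-item wins while the slot is still 0 stores its value */
}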
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
@@ -15257,13 +15257,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float);
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float);
#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float);
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float);
#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float);
@@ -15281,13 +15285,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float2, float2);
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float2, float2);
#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float2, float2);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float2, float2);
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float2, float2);
#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float4, float4);
@@ -15380,9 +15388,11 @@ float4 __ovld __purefn read_imagef(read_write image2d_array_t, int4);
int4 __ovld __purefn read_imagei(read_write image2d_array_t, int4);
uint4 __ovld __purefn read_imageui(read_write image2d_array_t, int4);
#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, int4);
int4 __ovld __purefn read_imagei(read_write image3d_t, int4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_write image2d_depth_t, int2);
@@ -15423,9 +15433,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4
float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float);
#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float);
#endif // cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image1d_t, sampler_t, float, float, float);
int4 __ovld __purefn read_imagei(read_write image1d_t, sampler_t, float, float, float);
@@ -15447,9 +15459,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4
float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float2, float2);
#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float4, float4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float4, float4);
#endif // cl_khr_3d_image_writes
#endif //cl_khr_mipmap_image
@@ -15457,7 +15471,9 @@ uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, floa
#ifdef cl_khr_fp16
half4 __ovld __purefn read_imageh(read_write image1d_t, int);
half4 __ovld __purefn read_imageh(read_write image2d_t, int2);
#ifdef cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image3d_t, int4);
#endif // cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image1d_array_t, int2);
half4 __ovld __purefn read_imageh(read_write image2d_array_t, int4);
half4 __ovld __purefn read_imageh(read_write image1d_buffer_t, int);
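A hedged sketch of why the read_write image3d_t overloads above gained the cl_khr_3d_image_writes guard: a read-write 3D image argument is only usable when 3D image writes are supported. The kernel name is hypothetical, and read-write images additionally assume __opencl_c_read_write_images.

#if defined(cl_khr_3d_image_writes) && defined(__opencl_c_read_write_images)
__kernel void touch_voxel(read_write image3d_t vol, int4 p) {
  float4 v = read_imagef(vol, p);   /* overload guarded above */
  write_imagef(vol, p, 2.0f * v);   /* writing 3D images needs the extension */
}
#endif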
@@ -15727,7 +15743,9 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t);
int __ovld __cnfn get_image_width(read_write image1d_t);
int __ovld __cnfn get_image_width(read_write image1d_buffer_t);
int __ovld __cnfn get_image_width(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image1d_array_t);
int __ovld __cnfn get_image_width(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -15777,7 +15795,9 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t);
#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_height(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_write image2d_depth_t);
@@ -15798,11 +15818,11 @@ int __ovld __cnfn get_image_depth(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t);
#endif
#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_depth(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes
// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@@ -15824,24 +15844,32 @@ int __ovld get_image_num_mip_levels(write_only image3d_t);
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_t);
int __ovld get_image_num_mip_levels(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(read_write image3d_t);
#endif // cl_khr_3d_image_writes
#endif //defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_only image1d_array_t);
int __ovld get_image_num_mip_levels(read_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_only image2d_depth_t);
#endif // cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image1d_array_t);
int __ovld get_image_num_mip_levels(write_only image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(write_only image2d_depth_t);
#endif // cl_khr_depth_images
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_array_t);
int __ovld get_image_num_mip_levels(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_write image2d_depth_t);
#endif // cl_khr_depth_images
#endif //defined(__opencl_c_read_write_images)
#endif //cl_khr_mipmap_image
@@ -15906,7 +15934,9 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_dept
int __ovld __cnfn get_image_channel_data_type(read_write image1d_t);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -15978,7 +16008,9 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t)
int __ovld __cnfn get_image_channel_order(read_write image1d_t);
int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image3d_t);
#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -16048,10 +16080,10 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t);
int4 __ovld __cnfn get_image_dim(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t);
#endif
#if defined(__opencl_c_read_write_images)
int4 __ovld __cnfn get_image_dim(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
#endif // cl_khr_3d_image_writes
/**
* Return the image array size.
@@ -16266,9 +16298,9 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags, memory_scope);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __conv sub_group_all(int predicate);
@@ -17847,15 +17879,13 @@ intel_sub_group_avc_sic_configure_skc(
uint skip_block_partition_type, uint skip_motion_vector_mask,
ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel,
+2 -2
View File
@@ -36,7 +36,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -2262,7 +2262,7 @@ extern __inline __m128d
#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* EMMINTRIN_H_ */
+1 -1
View File
@@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <stdlib.h>
+2 -2
View File
@@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -1447,7 +1447,7 @@ extern __inline __m64
#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* _MMINTRIN_H_INCLUDED */
+2 -2
View File
@@ -39,7 +39,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* We need definitions from the SSE2 and SSE header files*/
@@ -139,7 +139,7 @@ extern __inline __m128i
#else
#include_next <pmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* PMMINTRIN_H_ */
+2 -2
View File
@@ -29,7 +29,7 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -657,7 +657,7 @@ extern __inline __m128i
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* SMMINTRIN_H_ */
+2 -2
View File
@@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -447,7 +447,7 @@ extern __inline __m64
#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* TMMINTRIN_H_ */
+2 -2
View File
@@ -35,7 +35,7 @@
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_
#if defined(__ppc64__) && \
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* Define four value permute mask */
@@ -1821,7 +1821,7 @@ extern __inline void
#else
#include_next <xmmintrin.h>
#endif /* defined(__ppc64__) &&
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* XMMINTRIN_H_ */
+61
View File
@@ -0,0 +1,61 @@
/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __PRFCHIINTRIN_H
#define __PRFCHIINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))
/// Loads an instruction sequence containing the specified memory address into
/// all level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit0(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}
/// Loads an instruction sequence containing the specified memory address into
/// all but the first-level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit1(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __PRFCHIINTRIN_H */
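A minimal usage sketch for the new PREFETCHI intrinsics, assuming an x86-64 toolchain with the prefetchi feature enabled (e.g. -mprefetchi); hot_handler is a hypothetical function whose code we want resident before it is called.

#include <x86intrin.h>
extern void hot_handler(void);
void warm_icache(void) {
  /* Function-to-data pointer casts are a common GNU extension; the intrinsics
     take the code address to prefetch. */
  _m_prefetchit0((const void *)hot_handler); /* into all cache levels */
  _m_prefetchit1((const void *)hot_handler); /* into all levels except L1 */
}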
+203
View File
@@ -0,0 +1,203 @@
/*===----------------------- raointintrin.h - RAOINT ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __RAOINTINTRIN_H
#define __RAOINTINTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("raoint")))
/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
__builtin_ia32_aadd32((int *)__A, __B);
}
/// Atomically and a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
__builtin_ia32_aand32((int *)__A, __B);
}
/// Atomically or a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
__builtin_ia32_aor32((int *)__A, __B);
}
/// Atomically xor a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
__builtin_ia32_axor32((int *)__A, __B);
}
#ifdef __x86_64__
/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
long long __B) {
__builtin_ia32_aadd64((long long *)__A, __B);
}
/// Atomically and a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
long long __B) {
__builtin_ia32_aand64((long long *)__A, __B);
}
/// Atomically or a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
long long __B) {
__builtin_ia32_aor64((long long *)__A, __B);
}
/// Atomically xor a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in bad performance for hot data used by single thread only.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
long long __B) {
__builtin_ia32_axor64((long long *)__A, __B);
}
#endif // __x86_64__
#undef __DEFAULT_FN_ATTRS
#endif // __RAOINTINTRIN_H
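A sketch of the fire-and-forget update pattern the RAO-INT intrinsics target, assuming an x86 target compiled with -mraoint; counters is a hypothetical array shared by many threads.

#include <x86gprintrin.h>
void bump(int *counters, int idx) {
  /* No old value is returned, matching the header's advice to use these
     only for contended, weakly ordered data. */
  _aadd_i32(&counters[idx], 1);
}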
+15 -6
View File
@@ -25,6 +25,8 @@ extern "C" {
#pragma clang riscv intrinsic vector
#define vlenb() __builtin_rvv_vlenb()
enum RVV_CSR {
RVV_VSTART = 0,
RVV_VXSAT,
@@ -70,7 +72,6 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
}
}
#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
#define vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
#define vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
@@ -78,25 +79,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
#define vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)
#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
#define vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
#define vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
#define vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
#define vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)
#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
#define vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
#define vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
#define vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
#define vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)
#if __riscv_v_elen >= 64
#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
#define vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
#define vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
#define vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
#define vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
#endif
#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
#define vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
#define vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
@@ -104,23 +108,28 @@ void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
#define vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
#define vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)
#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
#define vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
#define vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
#define vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
#define vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)
#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
#define vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
#define vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
#define vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
#define vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)
#if __riscv_v_elen >= 64
#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
#define vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
#define vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
#define vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
#define vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
#endif
typedef __rvv_bool64_t vbool64_t;
typedef __rvv_bool32_t vbool32_t;
typedef __rvv_bool16_t vbool16_t;
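A sketch of what the new __riscv_v_elen >= 64 guard buys: on Zve32* configurations ELEN is 32, so the e64 forms (and the fractional-LMUL forms that need a SEW/LMUL ratio of 64) must not be visible. The function below is hypothetical.

#include <stddef.h>
#include <riscv_vector.h>
size_t pick_vl(size_t avl) {
#if __riscv_v_elen >= 64
  return vsetvl_e64m1(avl); /* only defined when ELEN allows 64-bit elements */
#else
  return vsetvl_e32m1(avl); /* available on every V/Zve32 configuration */
#endif
}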
+1 -1
View File
@@ -818,7 +818,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
/// parameter, is copied to the result.
/// \param N
/// Specifies which bits from operand \a Y will be copied, which bits in the
/// result they will be be copied to, and which bits in the result will be
/// result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made: \n
/// Bits [7:6] specify the bits to copy from operand \a Y: \n
/// 00: Selects bits [31:0] from operand \a Y. \n
+23 -7
View File
@@ -8,13 +8,30 @@
*/
#ifndef __STDARG_H
#define __STDARG_H
#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST
typedef __builtin_va_list __gnuc_va_list;
#endif
#ifdef __need___va_list
#undef __need___va_list
#else
#define __STDARG_H
#ifndef _VA_LIST
typedef __builtin_va_list va_list;
#define _VA_LIST
#endif
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* C2x does not require the second parameter for va_start. */
#define va_start(ap, ...) __builtin_va_start(ap, 0)
#else
/* Versions before C2x do require the second parameter. */
#define va_start(ap, param) __builtin_va_start(ap, param)
#endif
#define va_end(ap) __builtin_va_end(ap)
#define va_arg(ap, type) __builtin_va_arg(ap, type)
@@ -23,13 +40,12 @@ typedef __builtin_va_list va_list;
*/
#define __va_copy(d,s) __builtin_va_copy(d,s)
#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
!defined(__STRICT_ANSI__)
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif
#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST 1
typedef __builtin_va_list __gnuc_va_list;
#endif
#endif /* __STDARG_H */
#endif /* not __STDARG_H */
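A sketch of the two va_start shapes this header now supports; sum_all is hypothetical, and the single-argument form assumes a C2x (-std=c2x) translation unit.

#include <stdarg.h>
int sum_all(int count, ...) {
  va_list ap;
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
  va_start(ap);        /* C2x: the last named parameter may be omitted */
#else
  va_start(ap, count); /* pre-C2x: the second argument is required */
#endif
  int total = 0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}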
+6 -3
View File
@@ -15,10 +15,12 @@
*
* Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
* to the clang resource header until that is fully supported.
* to the clang resource header until that is fully supported. The
* `stdatomic.h` header requires C++ 23 or newer.
*/
#if __STDC_HOSTED__ && \
__has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
__has_include_next(<stdatomic.h>) && \
(!defined(_MSC_VER) || (defined(__cplusplus) && __cplusplus >= 202002L))
# include_next <stdatomic.h>
#else
@@ -45,7 +47,8 @@ extern "C" {
/* 7.17.2 Initialization */
#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
#if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201710L) || \
(defined(__cplusplus) && __cplusplus >= 202002L)) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
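A sketch of the deprecation wired up above: under C17/C2x or C++20 the ATOMIC_VAR_INIT line now triggers the deprecation pragma, and plain initialization is the replacement. Variable names are hypothetical.

#include <stdatomic.h>
atomic_int old_style = ATOMIC_VAR_INIT(0); /* warns: deprecated in C17 / C++20 */
atomic_int new_style = 0;                  /* equivalent, preferred form */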
+2 -2
View File
@@ -12,7 +12,7 @@
#define __bool_true_false_are_defined 1
#if __STDC_VERSION__ > 201710L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
* to system headers which include this header file unconditionally.
*/
@@ -23,7 +23,7 @@
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if __cplusplus < 201103L
#if defined(__cplusplus) && __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
+8 -1
View File
@@ -97,8 +97,15 @@ using ::std::nullptr_t;
#undef __need_NULL
#endif /* defined(__need_NULL) */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
typedef typeof(nullptr) nullptr_t;
#endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */
#if defined(__need_STDDEF_H_misc)
#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
(defined(__cplusplus) && __cplusplus >= 201103L)
#include "__stddef_max_align_t.h"
#endif
#define offsetof(t, d) __builtin_offsetof(t, d)
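A sketch of the nullptr_t typedef added above, assuming a -std=c2x build where nullptr is a keyword; first_node is a hypothetical function.

#include <stddef.h>
struct node { struct node *next; };
struct node *first_node(void) {
  nullptr_t none = nullptr; /* typeof(nullptr), per the new typedef */
  (void)none;
  return nullptr;           /* the null pointer constant converts to any pointer type */
}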
+158 -40
View File
@@ -96,13 +96,21 @@
typedef __INT64_TYPE__ int64_t;
# endif /* __int8_t_defined */
typedef __UINT64_TYPE__ uint64_t;
# undef __int_least64_t
# define __int_least64_t int64_t
# undef __uint_least64_t
# define __uint_least64_t uint64_t
# undef __int_least32_t
# define __int_least32_t int64_t
# undef __uint_least32_t
# define __uint_least32_t uint64_t
# undef __int_least16_t
# define __int_least16_t int64_t
# undef __uint_least16_t
# define __uint_least16_t uint64_t
# undef __int_least8_t
# define __int_least8_t int64_t
# undef __uint_least8_t
# define __uint_least8_t uint64_t
#endif /* __INT64_TYPE__ */
@@ -120,11 +128,17 @@ typedef int56_t int_least56_t;
typedef uint56_t uint_least56_t;
typedef int56_t int_fast56_t;
typedef uint56_t uint_fast56_t;
# undef __int_least32_t
# define __int_least32_t int56_t
# undef __uint_least32_t
# define __uint_least32_t uint56_t
# undef __int_least16_t
# define __int_least16_t int56_t
# undef __uint_least16_t
# define __uint_least16_t uint56_t
# undef __int_least8_t
# define __int_least8_t int56_t
# undef __uint_least8_t
# define __uint_least8_t uint56_t
#endif /* __INT56_TYPE__ */
@@ -136,11 +150,17 @@ typedef int48_t int_least48_t;
typedef uint48_t uint_least48_t;
typedef int48_t int_fast48_t;
typedef uint48_t uint_fast48_t;
# undef __int_least32_t
# define __int_least32_t int48_t
# undef __uint_least32_t
# define __uint_least32_t uint48_t
# undef __int_least16_t
# define __int_least16_t int48_t
# undef __uint_least16_t
# define __uint_least16_t uint48_t
# undef __int_least8_t
# define __int_least8_t int48_t
# undef __uint_least8_t
# define __uint_least8_t uint48_t
#endif /* __INT48_TYPE__ */
@@ -152,11 +172,17 @@ typedef int40_t int_least40_t;
typedef uint40_t uint_least40_t;
typedef int40_t int_fast40_t;
typedef uint40_t uint_fast40_t;
# undef __int_least32_t
# define __int_least32_t int40_t
# undef __uint_least32_t
# define __uint_least32_t uint40_t
# undef __int_least16_t
# define __int_least16_t int40_t
# undef __uint_least16_t
# define __uint_least16_t uint40_t
# undef __int_least8_t
# define __int_least8_t int40_t
# undef __uint_least8_t
# define __uint_least8_t uint40_t
#endif /* __INT40_TYPE__ */
@@ -172,11 +198,17 @@ typedef __INT32_TYPE__ int32_t;
typedef __UINT32_TYPE__ uint32_t;
# endif /* __uint32_t_defined */
# undef __int_least32_t
# define __int_least32_t int32_t
# undef __uint_least32_t
# define __uint_least32_t uint32_t
# undef __int_least16_t
# define __int_least16_t int32_t
# undef __uint_least16_t
# define __uint_least16_t uint32_t
# undef __int_least8_t
# define __int_least8_t int32_t
# undef __uint_least8_t
# define __uint_least8_t uint32_t
#endif /* __INT32_TYPE__ */
@@ -194,9 +226,13 @@ typedef int24_t int_least24_t;
typedef uint24_t uint_least24_t;
typedef int24_t int_fast24_t;
typedef uint24_t uint_fast24_t;
# undef __int_least16_t
# define __int_least16_t int24_t
# undef __uint_least16_t
# define __uint_least16_t uint24_t
# undef __int_least8_t
# define __int_least8_t int24_t
# undef __uint_least8_t
# define __uint_least8_t uint24_t
#endif /* __INT24_TYPE__ */
@@ -205,9 +241,13 @@ typedef uint24_t uint_fast24_t;
typedef __INT16_TYPE__ int16_t;
#endif /* __int8_t_defined */
typedef __UINT16_TYPE__ uint16_t;
# undef __int_least16_t
# define __int_least16_t int16_t
# undef __uint_least16_t
# define __uint_least16_t uint16_t
# undef __int_least8_t
# define __int_least8_t int16_t
# undef __uint_least8_t
# define __uint_least8_t uint16_t
#endif /* __INT16_TYPE__ */
@@ -224,7 +264,9 @@ typedef __uint_least16_t uint_fast16_t;
typedef __INT8_TYPE__ int8_t;
#endif /* __int8_t_defined */
typedef __UINT8_TYPE__ uint8_t;
# undef __int_least8_t
# define __int_least8_t int8_t
# undef __uint_least8_t
# define __uint_least8_t uint8_t
#endif /* __INT8_TYPE__ */
@@ -285,16 +327,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT64_TYPE__
# undef __int64_c_suffix
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT64_C_SUFFIX__
# define __int64_c_suffix __INT64_C_SUFFIX__
# define __int32_c_suffix __INT64_C_SUFFIX__
# define __int16_c_suffix __INT64_C_SUFFIX__
# define __int8_c_suffix __INT64_C_SUFFIX__
# else
# undef __int64_c_suffix
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT64_C_SUFFIX__ */
#endif /* __INT64_TYPE__ */
@@ -310,6 +351,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT56_TYPE__
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT56_C_SUFFIX__
# define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
# define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
@@ -319,14 +363,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT56_C(v) v
# define UINT56_C(v) v ## U
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT56_C_SUFFIX__ */
#endif /* __INT56_TYPE__ */
#ifdef __INT48_TYPE__
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT48_C_SUFFIX__
# define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
# define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
@@ -336,14 +380,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT48_C(v) v
# define UINT48_C(v) v ## U
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT48_C_SUFFIX__ */
#endif /* __INT48_TYPE__ */
#ifdef __INT40_TYPE__
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT40_C_SUFFIX__
# define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
# define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
@@ -353,22 +397,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT40_C(v) v
# define UINT40_C(v) v ## U
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT40_C_SUFFIX__ */
#endif /* __INT40_TYPE__ */
#ifdef __INT32_TYPE__
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT32_C_SUFFIX__
# define __int32_c_suffix __INT32_C_SUFFIX__
# define __int16_c_suffix __INT32_C_SUFFIX__
# define __int8_c_suffix __INT32_C_SUFFIX__
#else
# undef __int32_c_suffix
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT32_C_SUFFIX__ */
#endif /* __INT32_TYPE__ */
@@ -384,6 +424,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT24_TYPE__
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT24_C_SUFFIX__
# define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
# define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
@@ -392,19 +434,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT24_C(v) v
# define UINT24_C(v) v ## U
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT24_C_SUFFIX__ */
#endif /* __INT24_TYPE__ */
#ifdef __INT16_TYPE__
# undef __int16_c_suffix
# undef __int8_c_suffix
# ifdef __INT16_C_SUFFIX__
# define __int16_c_suffix __INT16_C_SUFFIX__
# define __int8_c_suffix __INT16_C_SUFFIX__
#else
# undef __int16_c_suffix
# undef __int8_c_suffix
# endif /* __INT16_C_SUFFIX__ */
#endif /* __INT16_TYPE__ */
@@ -420,10 +459,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT8_TYPE__
# undef __int8_c_suffix
# ifdef __INT8_C_SUFFIX__
# define __int8_c_suffix __INT8_C_SUFFIX__
#else
# undef __int8_c_suffix
# endif /* __INT8_C_SUFFIX__ */
#endif /* __INT8_TYPE__ */
@@ -463,27 +501,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define UINT64_MAX UINT64_C(18446744073709551615)
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT64_WIDTH 64
# define INT64_WIDTH UINT64_WIDTH
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __STDC_VERSION__ */
# define __INT_LEAST64_MIN INT64_MIN
# define __INT_LEAST64_MAX INT64_MAX
# define __UINT_LEAST64_MAX UINT64_MAX
# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT64_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT64_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT64_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT64_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT64_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT64_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT64_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT64_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __INT64_TYPE__ */
@@ -497,7 +547,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH
@@ -517,27 +567,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX
# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT56_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT56_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT56_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT56_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT56_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT56_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT56_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT56_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */
@@ -554,27 +616,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX
# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT48_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT48_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT48_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT48_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT48_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT48_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT48_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT48_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#undef __UINT_LEAST32_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
# undef __UINT_LEAST16_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
# undef __UINT_LEAST8_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
@@ -591,27 +665,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX
# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT40_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT40_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT40_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT40_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT40_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT40_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT40_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT40_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */
@@ -622,23 +708,35 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)
# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT32_MIN
# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT32_MAX
# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT32_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT32_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT32_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT32_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT32_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
@@ -653,7 +751,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH
@@ -673,23 +771,31 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT24_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT24_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT24_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT24_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT24_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@@ -700,19 +806,27 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)
# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT16_MIN
# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT16_MAX
# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT16_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT16_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */
@@ -727,7 +841,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
@@ -741,15 +855,19 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)
# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT8_MIN
# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT8_MAX
# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */
@@ -764,7 +882,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH
@@ -792,7 +910,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__
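Because the standard requires the signed and unsigned pointer-width macros to agree, a consumer can sanity-check the forwarding shown here; a minimal sketch, assuming C2x mode so the macros exist:

_Static_assert(INTPTR_WIDTH == UINTPTR_WIDTH,
               "C2x requires matching pointer width macros");
_Static_assert(INTPTR_WIDTH == __INTPTR_WIDTH__,
               "stdint.h forwards the compiler-internal width");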
@@ -813,7 +931,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__
@@ -849,7 +967,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
+1 -1
View File
@@ -13,7 +13,7 @@
#define noreturn _Noreturn
#define __noreturn_is_defined 1
#if __STDC_VERSION__ > 201710L && \
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* The noreturn macro is deprecated in C2x. We do not mark it as such because
including the header file in C2x is also deprecated and we do not want to
+2 -1
View File
@@ -65,7 +65,8 @@ struct _Unwind_Context;
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
defined(__ARM_DWARF_EH__) || defined(__SEH__))
struct _Unwind_Control_Block;
typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
typedef struct _Unwind_Control_Block _Unwind_Control_Block;
#define _Unwind_Exception _Unwind_Control_Block /* Alias */
#else
struct _Unwind_Exception;
typedef struct _Unwind_Exception _Unwind_Exception;
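The macro-based alias changes which spellings are accepted on ARM EHABI targets: with the old typedef, _Unwind_Exception was only a typedef name, so the elaborated form struct _Unwind_Exception named a distinct, never-defined tag; with the #define, both spellings now resolve to struct _Unwind_Control_Block. A small hedged illustration (pointer declarations only, so an incomplete type is fine):

struct _Unwind_Exception *a;  /* expands to: struct _Unwind_Control_Block *a; */
_Unwind_Exception *b;         /* expands to: _Unwind_Control_Block *b;        */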
+1 -1
View File
@@ -13,7 +13,7 @@
typedef double __vr __attribute__((__vector_size__(2048)));
// Vector mask registers
#if __STDC_VERSION__ >= 199901L
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
+19 -7
View File
@@ -25,23 +25,35 @@
#include <crc32intrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__PRFCHI__)
#include <prfchiintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RAOINT__)
#include <raointintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__CMPCCXADD__)
#include <cmpccxaddintrin.h>
#endif
#if defined(__i386__)
#define __FULLBX "ebx"
#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
#define __TMPGPR "eax"
#else
// On a 64-bit target, 32-bit operands produce a 32-bit result that is
// zero-extended into the destination general-purpose register. This means
// "mov x, %ebx" clobbers the upper 32 bits of rbx, so we must preserve the
// full 64-bit register rbx.
#define __FULLBX "rbx"
#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
#define __TMPGPR "rax"
#endif
#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
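// Illustrative note (not part of the header): with __FULLBX = "rbx" and
// __TMPGPR = "rax", string-literal concatenation makes __SAVE_GPRBX expand to
//   "mov {%%rbx, %%rax|rax, rbx};"
// i.e. AT&T "mov %rbx, %rax" or Intel "mov rax, rbx", stashing rbx in rax
// before the 32-bit ebx write below; __RESTORE_GPRBX then moves it back.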
#define __SSC_MARK(__Tag) \
__asm__ __volatile__( __SAVE_GPRBX \
"mov {%0, %%ebx|ebx, %0}; " \
+1 -2
View File
@@ -1906,7 +1906,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
return __extension__ (__m128){ 0, 0, 0, 0 };
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
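// Hedged usage sketch (hypothetical caller, not part of the header); the
// 0.0f literals above presumably avoid implicit int-to-float conversions
// in the vector initializer.
// __m128 z = _mm_setzero_ps();  /* all four float lanes hold 0.0f */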
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
@@ -3005,7 +3005,6 @@ do { \
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
#define _m_ _mm_
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX