update C headers to llvm9

upstream commit 1931d3cb20a00da732c5210b123656632982fde0
Andrew Kelley
2019-07-19 16:50:45 -04:00
parent 70da0762f7
commit 2117fbdae3
130 changed files with 9519 additions and 3542 deletions
+3 -17
@@ -1,22 +1,8 @@
/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+31 -18
@@ -1,22 +1,8 @@
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -44,12 +30,32 @@
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
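As a hedged illustration of that pattern (not part of the upstream diff), a device-side overload is declared at global scope and then re-exported from namespace std with a using-declaration:
__DEVICE__ float sin(float __x) { return ::sinf(__x); }  // global-scope device overload
namespace std {
using ::sin;  // lets std::sin(float) resolve to the overload above
}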
#ifdef _OPENMP
#define __DEVICE__ static __attribute__((always_inline))
#else
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
#endif
// For C++17 we need to include the noexcept attribute to be compatible
// with the header-defined version. This may be removed once
// variant is supported.
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
#define __NOEXCEPT noexcept
#else
#define __NOEXCEPT
#endif
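A minimal sketch of the mismatch __NOEXCEPT is meant to avoid; the host declaration shown here is an assumption for illustration, not part of the diff:
float fabs(float) noexcept;            // a C++17 host <cmath> may declare it noexcept
float fabs(float __x) { return __x < 0 ? -__x : __x; }
// error: exception specifications differ between declarations, so the affected
// wrappers below append __NOEXCEPT to stay in sync with the host headers.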
#if !(defined(_OPENMP) && defined(__cplusplus))
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
#endif
// TODO: remove once variant is supported.
#if defined(_OPENMP) && defined(__cplusplus)
__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); }
__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); }
#endif
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
@@ -58,9 +64,11 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
__DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
// TODO: remove when variant is supported
#ifndef _OPENMP
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
@@ -69,6 +77,7 @@ __DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
@@ -448,7 +457,10 @@ using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
// TODO: remove once variant is supported
#ifndef _OPENMP
using ::scalblnf;
#endif
using ::scalbnf;
using ::sinf;
using ::sinhf;
@@ -467,6 +479,7 @@ _GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif
#undef __NOEXCEPT
#undef __DEVICE__
#endif
+3 -17
@@ -1,22 +1,8 @@
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+59 -34
@@ -1,22 +1,8 @@
/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -24,15 +10,21 @@
#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#ifndef _OPENMP
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif
// __DEVICE__ is a helper macro with a common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions, and __forceinline__ helps inline these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef _OPENMP
#define __DEVICE__ static __attribute__((always_inline))
#else
#define __DEVICE__ static __device__ __forceinline__
#endif
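As a hedged illustration (the wrapper name here is invented), every wrapper in this file then follows one shape, and the attributes only have to be spelled once in the macro:
__DEVICE__ float __example_rsqrtf(float __a) { return __nv_rsqrtf(__a); }
// static keeps unused wrappers from being emitted at all, and __forceinline__
// (or always_inline under OpenMP) folds the extra call layer away even at -O1.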
// libdevice provides fast low-precision and slow full-precision implementations
// for some functions. Which one gets selected depends on
@@ -45,6 +37,15 @@
#define __FAST_OR_SLOW(fast, slow) slow
#endif
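A short sketch of how the selector reads at a call site, mirroring logf later in this file (the wrapper name is invented):
__DEVICE__ float __example_logf(float __a) {
  // expands to __nv_logf here; the fast variant is used only when __FAST_OR_SLOW
  // is defined above to pick its first argument
  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}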
// For C++17 we need to include the noexcept attribute to be compatible
// with the header-defined version. This may be removed once
// variant is supported.
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
#define __NOEXCEPT noexcept
#else
#define __NOEXCEPT
#endif
__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
@@ -52,8 +53,13 @@ __DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
__DEVICE__ unsigned long long __brevll(unsigned long long __a) {
return __nv_brevll(__a);
}
#if defined(__cplusplus)
__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
__DEVICE__ void __brkpt(int __a) { __brkpt(); }
#else
__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); }
__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
#endif
__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
unsigned int __c) {
return __nv_byte_perm(__a, __b, __c);
@@ -237,6 +243,9 @@ __DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
#ifdef _MSC_VER
__DEVICE__ int __finitel(long double __a);
#endif
__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
@@ -445,8 +454,14 @@ __DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
#ifdef _MSC_VER
__DEVICE__ int __isinfl(long double __a);
#endif
__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
#ifdef _MSC_VER
__DEVICE__ int __isnanl(long double __a);
#endif
__DEVICE__ double __ll2double_rd(long long __a) {
return __nv_ll2double_rd(__a);
}
@@ -520,8 +535,8 @@ __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) {
return __nv_fast_sincosf(__a, __sptr, __cptr);
__DEVICE__ void __sincosf(float __a, float *__s, float *__c) {
return __nv_fast_sincosf(__a, __s, __c);
}
__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); }
__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); }
@@ -1468,7 +1483,8 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
return r;
}
#endif // CUDA_VERSION >= 9020
__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); }
__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); }
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
@@ -1487,8 +1503,10 @@ __DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
#ifndef _OPENMP
__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); }
__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
#endif
__DEVICE__ double copysign(double __a, double __b) {
return __nv_copysign(__a, __b);
}
@@ -1525,7 +1543,6 @@ __DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
@@ -1563,16 +1580,16 @@ __DEVICE__ double j1(double __a) { return __nv_j1(__a); }
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
#if defined(__LP64__)
__DEVICE__ long labs(long __a) { return llabs(__a); };
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); };
#else
__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); };
#endif
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); }
__DEVICE__ long long llmax(long long __a, long long __b) {
return __nv_llmax(__a, __b);
}
@@ -1597,7 +1614,7 @@ __DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
__DEVICE__ float logf(float __a) {
return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}
#if defined(__LP64__)
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long lrint(double __a) { return llrint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
__DEVICE__ long lround(double __a) { return llround(__a); }
@@ -1609,12 +1626,16 @@ __DEVICE__ long lround(double __a) { return round(__a); }
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
#endif
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
// These functions shouldn't be declared when including this header
// for math function resolution purposes.
#ifndef _OPENMP
__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
return __builtin_memcpy(__a, __b, __c);
}
__DEVICE__ void *memset(void *__a, int __b, size_t __c) {
return __builtin_memset(__a, __b, __c);
}
#endif
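One kind of clash the guard above avoids, as a hedged sketch (the host declaration is illustrative, not from the diff): once a host header has declared memcpy with external linkage, a second static definition of the same name is ill-formed.
#include <stddef.h>
void *memcpy(void *, const void *, size_t);       // host-style declaration (illustrative)
static void *memcpy(void *__a, const void *__b, size_t __c) {
  return __builtin_memcpy(__a, __b, __c);         // error: static declaration of 'memcpy'
}                                                 // follows non-static declaration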
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
@@ -1698,6 +1719,8 @@ __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
// TODO: remove once variant is supported
#ifndef _OPENMP
__DEVICE__ double scalbln(double __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VAL : -HUGE_VAL;
@@ -1712,18 +1735,19 @@ __DEVICE__ float scalblnf(float __a, long __b) {
return __a > 0 ? 0.f : -0.f;
return scalbnf(__a, (int)__b);
}
#endif
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) {
return __nv_sincos(__a, __sptr, __cptr);
__DEVICE__ void sincos(double __a, double *__s, double *__c) {
return __nv_sincos(__a, __s, __c);
}
__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) {
return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr);
__DEVICE__ void sincosf(float __a, float *__s, float *__c) {
return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
}
__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) {
return __nv_sincospi(__a, __sptr, __cptr);
__DEVICE__ void sincospi(double __a, double *__s, double *__c) {
return __nv_sincospi(__a, __s, __c);
}
__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) {
return __nv_sincospif(__a, __sptr, __cptr);
__DEVICE__ void sincospif(float __a, float *__s, float *__c) {
return __nv_sincospif(__a, __s, __c);
}
__DEVICE__ float sinf(float __a) {
return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
@@ -1763,6 +1787,7 @@ __DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
#undef __NOEXCEPT
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__FAST_OR_SLOW")
#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+3 -17
@@ -1,22 +1,8 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+443 -447
@@ -1,22 +1,8 @@
/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -24,443 +10,453 @@
#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#if defined(__cplusplus)
extern "C" {
#endif
__device__ int __nv_abs(int __a);
__device__ double __nv_acos(double __a);
__device__ float __nv_acosf(float __a);
__device__ double __nv_acosh(double __a);
__device__ float __nv_acoshf(float __a);
__device__ double __nv_asin(double __a);
__device__ float __nv_asinf(float __a);
__device__ double __nv_asinh(double __a);
__device__ float __nv_asinhf(float __a);
__device__ double __nv_atan2(double __a, double __b);
__device__ float __nv_atan2f(float __a, float __b);
__device__ double __nv_atan(double __a);
__device__ float __nv_atanf(float __a);
__device__ double __nv_atanh(double __a);
__device__ float __nv_atanhf(float __a);
__device__ int __nv_brev(int __a);
__device__ long long __nv_brevll(long long __a);
__device__ int __nv_byte_perm(int __a, int __b, int __c);
__device__ double __nv_cbrt(double __a);
__device__ float __nv_cbrtf(float __a);
__device__ double __nv_ceil(double __a);
__device__ float __nv_ceilf(float __a);
__device__ int __nv_clz(int __a);
__device__ int __nv_clzll(long long __a);
__device__ double __nv_copysign(double __a, double __b);
__device__ float __nv_copysignf(float __a, float __b);
__device__ double __nv_cos(double __a);
__device__ float __nv_cosf(float __a);
__device__ double __nv_cosh(double __a);
__device__ float __nv_coshf(float __a);
__device__ double __nv_cospi(double __a);
__device__ float __nv_cospif(float __a);
__device__ double __nv_cyl_bessel_i0(double __a);
__device__ float __nv_cyl_bessel_i0f(float __a);
__device__ double __nv_cyl_bessel_i1(double __a);
__device__ float __nv_cyl_bessel_i1f(float __a);
__device__ double __nv_dadd_rd(double __a, double __b);
__device__ double __nv_dadd_rn(double __a, double __b);
__device__ double __nv_dadd_ru(double __a, double __b);
__device__ double __nv_dadd_rz(double __a, double __b);
__device__ double __nv_ddiv_rd(double __a, double __b);
__device__ double __nv_ddiv_rn(double __a, double __b);
__device__ double __nv_ddiv_ru(double __a, double __b);
__device__ double __nv_ddiv_rz(double __a, double __b);
__device__ double __nv_dmul_rd(double __a, double __b);
__device__ double __nv_dmul_rn(double __a, double __b);
__device__ double __nv_dmul_ru(double __a, double __b);
__device__ double __nv_dmul_rz(double __a, double __b);
__device__ float __nv_double2float_rd(double __a);
__device__ float __nv_double2float_rn(double __a);
__device__ float __nv_double2float_ru(double __a);
__device__ float __nv_double2float_rz(double __a);
__device__ int __nv_double2hiint(double __a);
__device__ int __nv_double2int_rd(double __a);
__device__ int __nv_double2int_rn(double __a);
__device__ int __nv_double2int_ru(double __a);
__device__ int __nv_double2int_rz(double __a);
__device__ long long __nv_double2ll_rd(double __a);
__device__ long long __nv_double2ll_rn(double __a);
__device__ long long __nv_double2ll_ru(double __a);
__device__ long long __nv_double2ll_rz(double __a);
__device__ int __nv_double2loint(double __a);
__device__ unsigned int __nv_double2uint_rd(double __a);
__device__ unsigned int __nv_double2uint_rn(double __a);
__device__ unsigned int __nv_double2uint_ru(double __a);
__device__ unsigned int __nv_double2uint_rz(double __a);
__device__ unsigned long long __nv_double2ull_rd(double __a);
__device__ unsigned long long __nv_double2ull_rn(double __a);
__device__ unsigned long long __nv_double2ull_ru(double __a);
__device__ unsigned long long __nv_double2ull_rz(double __a);
__device__ unsigned long long __nv_double_as_longlong(double __a);
__device__ double __nv_drcp_rd(double __a);
__device__ double __nv_drcp_rn(double __a);
__device__ double __nv_drcp_ru(double __a);
__device__ double __nv_drcp_rz(double __a);
__device__ double __nv_dsqrt_rd(double __a);
__device__ double __nv_dsqrt_rn(double __a);
__device__ double __nv_dsqrt_ru(double __a);
__device__ double __nv_dsqrt_rz(double __a);
__device__ double __nv_dsub_rd(double __a, double __b);
__device__ double __nv_dsub_rn(double __a, double __b);
__device__ double __nv_dsub_ru(double __a, double __b);
__device__ double __nv_dsub_rz(double __a, double __b);
__device__ double __nv_erfc(double __a);
__device__ float __nv_erfcf(float __a);
__device__ double __nv_erfcinv(double __a);
__device__ float __nv_erfcinvf(float __a);
__device__ double __nv_erfcx(double __a);
__device__ float __nv_erfcxf(float __a);
__device__ double __nv_erf(double __a);
__device__ float __nv_erff(float __a);
__device__ double __nv_erfinv(double __a);
__device__ float __nv_erfinvf(float __a);
__device__ double __nv_exp10(double __a);
__device__ float __nv_exp10f(float __a);
__device__ double __nv_exp2(double __a);
__device__ float __nv_exp2f(float __a);
__device__ double __nv_exp(double __a);
__device__ float __nv_expf(float __a);
__device__ double __nv_expm1(double __a);
__device__ float __nv_expm1f(float __a);
__device__ double __nv_fabs(double __a);
__device__ float __nv_fabsf(float __a);
__device__ float __nv_fadd_rd(float __a, float __b);
__device__ float __nv_fadd_rn(float __a, float __b);
__device__ float __nv_fadd_ru(float __a, float __b);
__device__ float __nv_fadd_rz(float __a, float __b);
__device__ float __nv_fast_cosf(float __a);
__device__ float __nv_fast_exp10f(float __a);
__device__ float __nv_fast_expf(float __a);
__device__ float __nv_fast_fdividef(float __a, float __b);
__device__ float __nv_fast_log10f(float __a);
__device__ float __nv_fast_log2f(float __a);
__device__ float __nv_fast_logf(float __a);
__device__ float __nv_fast_powf(float __a, float __b);
__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr);
__device__ float __nv_fast_sinf(float __a);
__device__ float __nv_fast_tanf(float __a);
__device__ double __nv_fdim(double __a, double __b);
__device__ float __nv_fdimf(float __a, float __b);
__device__ float __nv_fdiv_rd(float __a, float __b);
__device__ float __nv_fdiv_rn(float __a, float __b);
__device__ float __nv_fdiv_ru(float __a, float __b);
__device__ float __nv_fdiv_rz(float __a, float __b);
__device__ int __nv_ffs(int __a);
__device__ int __nv_ffsll(long long __a);
__device__ int __nv_finitef(float __a);
__device__ unsigned short __nv_float2half_rn(float __a);
__device__ int __nv_float2int_rd(float __a);
__device__ int __nv_float2int_rn(float __a);
__device__ int __nv_float2int_ru(float __a);
__device__ int __nv_float2int_rz(float __a);
__device__ long long __nv_float2ll_rd(float __a);
__device__ long long __nv_float2ll_rn(float __a);
__device__ long long __nv_float2ll_ru(float __a);
__device__ long long __nv_float2ll_rz(float __a);
__device__ unsigned int __nv_float2uint_rd(float __a);
__device__ unsigned int __nv_float2uint_rn(float __a);
__device__ unsigned int __nv_float2uint_ru(float __a);
__device__ unsigned int __nv_float2uint_rz(float __a);
__device__ unsigned long long __nv_float2ull_rd(float __a);
__device__ unsigned long long __nv_float2ull_rn(float __a);
__device__ unsigned long long __nv_float2ull_ru(float __a);
__device__ unsigned long long __nv_float2ull_rz(float __a);
__device__ int __nv_float_as_int(float __a);
__device__ unsigned int __nv_float_as_uint(float __a);
__device__ double __nv_floor(double __a);
__device__ float __nv_floorf(float __a);
__device__ double __nv_fma(double __a, double __b, double __c);
__device__ float __nv_fmaf(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
__device__ double __nv_fma_rd(double __a, double __b, double __c);
__device__ double __nv_fma_rn(double __a, double __b, double __c);
__device__ double __nv_fma_ru(double __a, double __b, double __c);
__device__ double __nv_fma_rz(double __a, double __b, double __c);
__device__ double __nv_fmax(double __a, double __b);
__device__ float __nv_fmaxf(float __a, float __b);
__device__ double __nv_fmin(double __a, double __b);
__device__ float __nv_fminf(float __a, float __b);
__device__ double __nv_fmod(double __a, double __b);
__device__ float __nv_fmodf(float __a, float __b);
__device__ float __nv_fmul_rd(float __a, float __b);
__device__ float __nv_fmul_rn(float __a, float __b);
__device__ float __nv_fmul_ru(float __a, float __b);
__device__ float __nv_fmul_rz(float __a, float __b);
__device__ float __nv_frcp_rd(float __a);
__device__ float __nv_frcp_rn(float __a);
__device__ float __nv_frcp_ru(float __a);
__device__ float __nv_frcp_rz(float __a);
__device__ double __nv_frexp(double __a, int *__b);
__device__ float __nv_frexpf(float __a, int *__b);
__device__ float __nv_frsqrt_rn(float __a);
__device__ float __nv_fsqrt_rd(float __a);
__device__ float __nv_fsqrt_rn(float __a);
__device__ float __nv_fsqrt_ru(float __a);
__device__ float __nv_fsqrt_rz(float __a);
__device__ float __nv_fsub_rd(float __a, float __b);
__device__ float __nv_fsub_rn(float __a, float __b);
__device__ float __nv_fsub_ru(float __a, float __b);
__device__ float __nv_fsub_rz(float __a, float __b);
__device__ int __nv_hadd(int __a, int __b);
__device__ float __nv_half2float(unsigned short __h);
__device__ double __nv_hiloint2double(int __a, int __b);
__device__ double __nv_hypot(double __a, double __b);
__device__ float __nv_hypotf(float __a, float __b);
__device__ int __nv_ilogb(double __a);
__device__ int __nv_ilogbf(float __a);
__device__ double __nv_int2double_rn(int __a);
__device__ float __nv_int2float_rd(int __a);
__device__ float __nv_int2float_rn(int __a);
__device__ float __nv_int2float_ru(int __a);
__device__ float __nv_int2float_rz(int __a);
__device__ float __nv_int_as_float(int __a);
__device__ int __nv_isfinited(double __a);
__device__ int __nv_isinfd(double __a);
__device__ int __nv_isinff(float __a);
__device__ int __nv_isnand(double __a);
__device__ int __nv_isnanf(float __a);
__device__ double __nv_j0(double __a);
__device__ float __nv_j0f(float __a);
__device__ double __nv_j1(double __a);
__device__ float __nv_j1f(float __a);
__device__ float __nv_jnf(int __a, float __b);
__device__ double __nv_jn(int __a, double __b);
__device__ double __nv_ldexp(double __a, int __b);
__device__ float __nv_ldexpf(float __a, int __b);
__device__ double __nv_lgamma(double __a);
__device__ float __nv_lgammaf(float __a);
__device__ double __nv_ll2double_rd(long long __a);
__device__ double __nv_ll2double_rn(long long __a);
__device__ double __nv_ll2double_ru(long long __a);
__device__ double __nv_ll2double_rz(long long __a);
__device__ float __nv_ll2float_rd(long long __a);
__device__ float __nv_ll2float_rn(long long __a);
__device__ float __nv_ll2float_ru(long long __a);
__device__ float __nv_ll2float_rz(long long __a);
__device__ long long __nv_llabs(long long __a);
__device__ long long __nv_llmax(long long __a, long long __b);
__device__ long long __nv_llmin(long long __a, long long __b);
__device__ long long __nv_llrint(double __a);
__device__ long long __nv_llrintf(float __a);
__device__ long long __nv_llround(double __a);
__device__ long long __nv_llroundf(float __a);
__device__ double __nv_log10(double __a);
__device__ float __nv_log10f(float __a);
__device__ double __nv_log1p(double __a);
__device__ float __nv_log1pf(float __a);
__device__ double __nv_log2(double __a);
__device__ float __nv_log2f(float __a);
__device__ double __nv_logb(double __a);
__device__ float __nv_logbf(float __a);
__device__ double __nv_log(double __a);
__device__ float __nv_logf(float __a);
__device__ double __nv_longlong_as_double(long long __a);
__device__ int __nv_max(int __a, int __b);
__device__ int __nv_min(int __a, int __b);
__device__ double __nv_modf(double __a, double *__b);
__device__ float __nv_modff(float __a, float *__b);
__device__ int __nv_mul24(int __a, int __b);
__device__ long long __nv_mul64hi(long long __a, long long __b);
__device__ int __nv_mulhi(int __a, int __b);
__device__ double __nv_nan(const signed char *__a);
__device__ float __nv_nanf(const signed char *__a);
__device__ double __nv_nearbyint(double __a);
__device__ float __nv_nearbyintf(float __a);
__device__ double __nv_nextafter(double __a, double __b);
__device__ float __nv_nextafterf(float __a, float __b);
__device__ double __nv_norm3d(double __a, double __b, double __c);
__device__ float __nv_norm3df(float __a, float __b, float __c);
__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__device__ double __nv_normcdf(double __a);
__device__ float __nv_normcdff(float __a);
__device__ double __nv_normcdfinv(double __a);
__device__ float __nv_normcdfinvf(float __a);
__device__ float __nv_normf(int __a, const float *__b);
__device__ double __nv_norm(int __a, const double *__b);
__device__ int __nv_popc(int __a);
__device__ int __nv_popcll(long long __a);
__device__ double __nv_pow(double __a, double __b);
__device__ float __nv_powf(float __a, float __b);
__device__ double __nv_powi(double __a, int __b);
__device__ float __nv_powif(float __a, int __b);
__device__ double __nv_rcbrt(double __a);
__device__ float __nv_rcbrtf(float __a);
__device__ double __nv_rcp64h(double __a);
__device__ double __nv_remainder(double __a, double __b);
__device__ float __nv_remainderf(float __a, float __b);
__device__ double __nv_remquo(double __a, double __b, int *__c);
__device__ float __nv_remquof(float __a, float __b, int *__c);
__device__ int __nv_rhadd(int __a, int __b);
__device__ double __nv_rhypot(double __a, double __b);
__device__ float __nv_rhypotf(float __a, float __b);
__device__ double __nv_rint(double __a);
__device__ float __nv_rintf(float __a);
__device__ double __nv_rnorm3d(double __a, double __b, double __c);
__device__ float __nv_rnorm3df(float __a, float __b, float __c);
__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__device__ float __nv_rnormf(int __a, const float *__b);
__device__ double __nv_rnorm(int __a, const double *__b);
__device__ double __nv_round(double __a);
__device__ float __nv_roundf(float __a);
__device__ double __nv_rsqrt(double __a);
__device__ float __nv_rsqrtf(float __a);
__device__ int __nv_sad(int __a, int __b, int __c);
__device__ float __nv_saturatef(float __a);
__device__ double __nv_scalbn(double __a, int __b);
__device__ float __nv_scalbnf(float __a, int __b);
__device__ int __nv_signbitd(double __a);
__device__ int __nv_signbitf(float __a);
__device__ void __nv_sincos(double __a, double *__b, double *__c);
__device__ void __nv_sincosf(float __a, float *__b, float *__c);
__device__ void __nv_sincospi(double __a, double *__b, double *__c);
__device__ void __nv_sincospif(float __a, float *__b, float *__c);
__device__ double __nv_sin(double __a);
__device__ float __nv_sinf(float __a);
__device__ double __nv_sinh(double __a);
__device__ float __nv_sinhf(float __a);
__device__ double __nv_sinpi(double __a);
__device__ float __nv_sinpif(float __a);
__device__ double __nv_sqrt(double __a);
__device__ float __nv_sqrtf(float __a);
__device__ double __nv_tan(double __a);
__device__ float __nv_tanf(float __a);
__device__ double __nv_tanh(double __a);
__device__ float __nv_tanhf(float __a);
__device__ double __nv_tgamma(double __a);
__device__ float __nv_tgammaf(float __a);
__device__ double __nv_trunc(double __a);
__device__ float __nv_truncf(float __a);
__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__device__ double __nv_uint2double_rn(unsigned int __i);
__device__ float __nv_uint2float_rd(unsigned int __a);
__device__ float __nv_uint2float_rn(unsigned int __a);
__device__ float __nv_uint2float_ru(unsigned int __a);
__device__ float __nv_uint2float_rz(unsigned int __a);
__device__ float __nv_uint_as_float(unsigned int __a);
__device__ double __nv_ull2double_rd(unsigned long long __a);
__device__ double __nv_ull2double_rn(unsigned long long __a);
__device__ double __nv_ull2double_ru(unsigned long long __a);
__device__ double __nv_ull2double_rz(unsigned long long __a);
__device__ float __nv_ull2float_rd(unsigned long long __a);
__device__ float __nv_ull2float_rn(unsigned long long __a);
__device__ float __nv_ull2float_ru(unsigned long long __a);
__device__ float __nv_ull2float_rz(unsigned long long __a);
__device__ unsigned long long __nv_ullmax(unsigned long long __a,
#if defined(_OPENMP)
#define __DEVICE__
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
__DEVICE__ int __nv_abs(int __a);
__DEVICE__ double __nv_acos(double __a);
__DEVICE__ float __nv_acosf(float __a);
__DEVICE__ double __nv_acosh(double __a);
__DEVICE__ float __nv_acoshf(float __a);
__DEVICE__ double __nv_asin(double __a);
__DEVICE__ float __nv_asinf(float __a);
__DEVICE__ double __nv_asinh(double __a);
__DEVICE__ float __nv_asinhf(float __a);
__DEVICE__ double __nv_atan2(double __a, double __b);
__DEVICE__ float __nv_atan2f(float __a, float __b);
__DEVICE__ double __nv_atan(double __a);
__DEVICE__ float __nv_atanf(float __a);
__DEVICE__ double __nv_atanh(double __a);
__DEVICE__ float __nv_atanhf(float __a);
__DEVICE__ int __nv_brev(int __a);
__DEVICE__ long long __nv_brevll(long long __a);
__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c);
__DEVICE__ double __nv_cbrt(double __a);
__DEVICE__ float __nv_cbrtf(float __a);
__DEVICE__ double __nv_ceil(double __a);
__DEVICE__ float __nv_ceilf(float __a);
__DEVICE__ int __nv_clz(int __a);
__DEVICE__ int __nv_clzll(long long __a);
__DEVICE__ double __nv_copysign(double __a, double __b);
__DEVICE__ float __nv_copysignf(float __a, float __b);
__DEVICE__ double __nv_cos(double __a);
__DEVICE__ float __nv_cosf(float __a);
__DEVICE__ double __nv_cosh(double __a);
__DEVICE__ float __nv_coshf(float __a);
__DEVICE__ double __nv_cospi(double __a);
__DEVICE__ float __nv_cospif(float __a);
__DEVICE__ double __nv_cyl_bessel_i0(double __a);
__DEVICE__ float __nv_cyl_bessel_i0f(float __a);
__DEVICE__ double __nv_cyl_bessel_i1(double __a);
__DEVICE__ float __nv_cyl_bessel_i1f(float __a);
__DEVICE__ double __nv_dadd_rd(double __a, double __b);
__DEVICE__ double __nv_dadd_rn(double __a, double __b);
__DEVICE__ double __nv_dadd_ru(double __a, double __b);
__DEVICE__ double __nv_dadd_rz(double __a, double __b);
__DEVICE__ double __nv_ddiv_rd(double __a, double __b);
__DEVICE__ double __nv_ddiv_rn(double __a, double __b);
__DEVICE__ double __nv_ddiv_ru(double __a, double __b);
__DEVICE__ double __nv_ddiv_rz(double __a, double __b);
__DEVICE__ double __nv_dmul_rd(double __a, double __b);
__DEVICE__ double __nv_dmul_rn(double __a, double __b);
__DEVICE__ double __nv_dmul_ru(double __a, double __b);
__DEVICE__ double __nv_dmul_rz(double __a, double __b);
__DEVICE__ float __nv_double2float_rd(double __a);
__DEVICE__ float __nv_double2float_rn(double __a);
__DEVICE__ float __nv_double2float_ru(double __a);
__DEVICE__ float __nv_double2float_rz(double __a);
__DEVICE__ int __nv_double2hiint(double __a);
__DEVICE__ int __nv_double2int_rd(double __a);
__DEVICE__ int __nv_double2int_rn(double __a);
__DEVICE__ int __nv_double2int_ru(double __a);
__DEVICE__ int __nv_double2int_rz(double __a);
__DEVICE__ long long __nv_double2ll_rd(double __a);
__DEVICE__ long long __nv_double2ll_rn(double __a);
__DEVICE__ long long __nv_double2ll_ru(double __a);
__DEVICE__ long long __nv_double2ll_rz(double __a);
__DEVICE__ int __nv_double2loint(double __a);
__DEVICE__ unsigned int __nv_double2uint_rd(double __a);
__DEVICE__ unsigned int __nv_double2uint_rn(double __a);
__DEVICE__ unsigned int __nv_double2uint_ru(double __a);
__DEVICE__ unsigned int __nv_double2uint_rz(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rd(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rn(double __a);
__DEVICE__ unsigned long long __nv_double2ull_ru(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rz(double __a);
__DEVICE__ unsigned long long __nv_double_as_longlong(double __a);
__DEVICE__ double __nv_drcp_rd(double __a);
__DEVICE__ double __nv_drcp_rn(double __a);
__DEVICE__ double __nv_drcp_ru(double __a);
__DEVICE__ double __nv_drcp_rz(double __a);
__DEVICE__ double __nv_dsqrt_rd(double __a);
__DEVICE__ double __nv_dsqrt_rn(double __a);
__DEVICE__ double __nv_dsqrt_ru(double __a);
__DEVICE__ double __nv_dsqrt_rz(double __a);
__DEVICE__ double __nv_dsub_rd(double __a, double __b);
__DEVICE__ double __nv_dsub_rn(double __a, double __b);
__DEVICE__ double __nv_dsub_ru(double __a, double __b);
__DEVICE__ double __nv_dsub_rz(double __a, double __b);
__DEVICE__ double __nv_erfc(double __a);
__DEVICE__ float __nv_erfcf(float __a);
__DEVICE__ double __nv_erfcinv(double __a);
__DEVICE__ float __nv_erfcinvf(float __a);
__DEVICE__ double __nv_erfcx(double __a);
__DEVICE__ float __nv_erfcxf(float __a);
__DEVICE__ double __nv_erf(double __a);
__DEVICE__ float __nv_erff(float __a);
__DEVICE__ double __nv_erfinv(double __a);
__DEVICE__ float __nv_erfinvf(float __a);
__DEVICE__ double __nv_exp10(double __a);
__DEVICE__ float __nv_exp10f(float __a);
__DEVICE__ double __nv_exp2(double __a);
__DEVICE__ float __nv_exp2f(float __a);
__DEVICE__ double __nv_exp(double __a);
__DEVICE__ float __nv_expf(float __a);
__DEVICE__ double __nv_expm1(double __a);
__DEVICE__ float __nv_expm1f(float __a);
__DEVICE__ double __nv_fabs(double __a);
__DEVICE__ float __nv_fabsf(float __a);
__DEVICE__ float __nv_fadd_rd(float __a, float __b);
__DEVICE__ float __nv_fadd_rn(float __a, float __b);
__DEVICE__ float __nv_fadd_ru(float __a, float __b);
__DEVICE__ float __nv_fadd_rz(float __a, float __b);
__DEVICE__ float __nv_fast_cosf(float __a);
__DEVICE__ float __nv_fast_exp10f(float __a);
__DEVICE__ float __nv_fast_expf(float __a);
__DEVICE__ float __nv_fast_fdividef(float __a, float __b);
__DEVICE__ float __nv_fast_log10f(float __a);
__DEVICE__ float __nv_fast_log2f(float __a);
__DEVICE__ float __nv_fast_logf(float __a);
__DEVICE__ float __nv_fast_powf(float __a, float __b);
__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__DEVICE__ float __nv_fast_sinf(float __a);
__DEVICE__ float __nv_fast_tanf(float __a);
__DEVICE__ double __nv_fdim(double __a, double __b);
__DEVICE__ float __nv_fdimf(float __a, float __b);
__DEVICE__ float __nv_fdiv_rd(float __a, float __b);
__DEVICE__ float __nv_fdiv_rn(float __a, float __b);
__DEVICE__ float __nv_fdiv_ru(float __a, float __b);
__DEVICE__ float __nv_fdiv_rz(float __a, float __b);
__DEVICE__ int __nv_ffs(int __a);
__DEVICE__ int __nv_ffsll(long long __a);
__DEVICE__ int __nv_finitef(float __a);
__DEVICE__ unsigned short __nv_float2half_rn(float __a);
__DEVICE__ int __nv_float2int_rd(float __a);
__DEVICE__ int __nv_float2int_rn(float __a);
__DEVICE__ int __nv_float2int_ru(float __a);
__DEVICE__ int __nv_float2int_rz(float __a);
__DEVICE__ long long __nv_float2ll_rd(float __a);
__DEVICE__ long long __nv_float2ll_rn(float __a);
__DEVICE__ long long __nv_float2ll_ru(float __a);
__DEVICE__ long long __nv_float2ll_rz(float __a);
__DEVICE__ unsigned int __nv_float2uint_rd(float __a);
__DEVICE__ unsigned int __nv_float2uint_rn(float __a);
__DEVICE__ unsigned int __nv_float2uint_ru(float __a);
__DEVICE__ unsigned int __nv_float2uint_rz(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rd(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rn(float __a);
__DEVICE__ unsigned long long __nv_float2ull_ru(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rz(float __a);
__DEVICE__ int __nv_float_as_int(float __a);
__DEVICE__ unsigned int __nv_float_as_uint(float __a);
__DEVICE__ double __nv_floor(double __a);
__DEVICE__ float __nv_floorf(float __a);
__DEVICE__ double __nv_fma(double __a, double __b, double __c);
__DEVICE__ float __nv_fmaf(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c);
__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c);
__DEVICE__ double __nv_fmax(double __a, double __b);
__DEVICE__ float __nv_fmaxf(float __a, float __b);
__DEVICE__ double __nv_fmin(double __a, double __b);
__DEVICE__ float __nv_fminf(float __a, float __b);
__DEVICE__ double __nv_fmod(double __a, double __b);
__DEVICE__ float __nv_fmodf(float __a, float __b);
__DEVICE__ float __nv_fmul_rd(float __a, float __b);
__DEVICE__ float __nv_fmul_rn(float __a, float __b);
__DEVICE__ float __nv_fmul_ru(float __a, float __b);
__DEVICE__ float __nv_fmul_rz(float __a, float __b);
__DEVICE__ float __nv_frcp_rd(float __a);
__DEVICE__ float __nv_frcp_rn(float __a);
__DEVICE__ float __nv_frcp_ru(float __a);
__DEVICE__ float __nv_frcp_rz(float __a);
__DEVICE__ double __nv_frexp(double __a, int *__b);
__DEVICE__ float __nv_frexpf(float __a, int *__b);
__DEVICE__ float __nv_frsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_rd(float __a);
__DEVICE__ float __nv_fsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_ru(float __a);
__DEVICE__ float __nv_fsqrt_rz(float __a);
__DEVICE__ float __nv_fsub_rd(float __a, float __b);
__DEVICE__ float __nv_fsub_rn(float __a, float __b);
__DEVICE__ float __nv_fsub_ru(float __a, float __b);
__DEVICE__ float __nv_fsub_rz(float __a, float __b);
__DEVICE__ int __nv_hadd(int __a, int __b);
__DEVICE__ float __nv_half2float(unsigned short __h);
__DEVICE__ double __nv_hiloint2double(int __a, int __b);
__DEVICE__ double __nv_hypot(double __a, double __b);
__DEVICE__ float __nv_hypotf(float __a, float __b);
__DEVICE__ int __nv_ilogb(double __a);
__DEVICE__ int __nv_ilogbf(float __a);
__DEVICE__ double __nv_int2double_rn(int __a);
__DEVICE__ float __nv_int2float_rd(int __a);
__DEVICE__ float __nv_int2float_rn(int __a);
__DEVICE__ float __nv_int2float_ru(int __a);
__DEVICE__ float __nv_int2float_rz(int __a);
__DEVICE__ float __nv_int_as_float(int __a);
__DEVICE__ int __nv_isfinited(double __a);
__DEVICE__ int __nv_isinfd(double __a);
__DEVICE__ int __nv_isinff(float __a);
__DEVICE__ int __nv_isnand(double __a);
__DEVICE__ int __nv_isnanf(float __a);
__DEVICE__ double __nv_j0(double __a);
__DEVICE__ float __nv_j0f(float __a);
__DEVICE__ double __nv_j1(double __a);
__DEVICE__ float __nv_j1f(float __a);
__DEVICE__ float __nv_jnf(int __a, float __b);
__DEVICE__ double __nv_jn(int __a, double __b);
__DEVICE__ double __nv_ldexp(double __a, int __b);
__DEVICE__ float __nv_ldexpf(float __a, int __b);
__DEVICE__ double __nv_lgamma(double __a);
__DEVICE__ float __nv_lgammaf(float __a);
__DEVICE__ double __nv_ll2double_rd(long long __a);
__DEVICE__ double __nv_ll2double_rn(long long __a);
__DEVICE__ double __nv_ll2double_ru(long long __a);
__DEVICE__ double __nv_ll2double_rz(long long __a);
__DEVICE__ float __nv_ll2float_rd(long long __a);
__DEVICE__ float __nv_ll2float_rn(long long __a);
__DEVICE__ float __nv_ll2float_ru(long long __a);
__DEVICE__ float __nv_ll2float_rz(long long __a);
__DEVICE__ long long __nv_llabs(long long __a);
__DEVICE__ long long __nv_llmax(long long __a, long long __b);
__DEVICE__ long long __nv_llmin(long long __a, long long __b);
__DEVICE__ long long __nv_llrint(double __a);
__DEVICE__ long long __nv_llrintf(float __a);
__DEVICE__ long long __nv_llround(double __a);
__DEVICE__ long long __nv_llroundf(float __a);
__DEVICE__ double __nv_log10(double __a);
__DEVICE__ float __nv_log10f(float __a);
__DEVICE__ double __nv_log1p(double __a);
__DEVICE__ float __nv_log1pf(float __a);
__DEVICE__ double __nv_log2(double __a);
__DEVICE__ float __nv_log2f(float __a);
__DEVICE__ double __nv_logb(double __a);
__DEVICE__ float __nv_logbf(float __a);
__DEVICE__ double __nv_log(double __a);
__DEVICE__ float __nv_logf(float __a);
__DEVICE__ double __nv_longlong_as_double(long long __a);
__DEVICE__ int __nv_max(int __a, int __b);
__DEVICE__ int __nv_min(int __a, int __b);
__DEVICE__ double __nv_modf(double __a, double *__b);
__DEVICE__ float __nv_modff(float __a, float *__b);
__DEVICE__ int __nv_mul24(int __a, int __b);
__DEVICE__ long long __nv_mul64hi(long long __a, long long __b);
__DEVICE__ int __nv_mulhi(int __a, int __b);
__DEVICE__ double __nv_nan(const signed char *__a);
__DEVICE__ float __nv_nanf(const signed char *__a);
__DEVICE__ double __nv_nearbyint(double __a);
__DEVICE__ float __nv_nearbyintf(float __a);
__DEVICE__ double __nv_nextafter(double __a, double __b);
__DEVICE__ float __nv_nextafterf(float __a, float __b);
__DEVICE__ double __nv_norm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_norm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__DEVICE__ double __nv_normcdf(double __a);
__DEVICE__ float __nv_normcdff(float __a);
__DEVICE__ double __nv_normcdfinv(double __a);
__DEVICE__ float __nv_normcdfinvf(float __a);
__DEVICE__ float __nv_normf(int __a, const float *__b);
__DEVICE__ double __nv_norm(int __a, const double *__b);
__DEVICE__ int __nv_popc(int __a);
__DEVICE__ int __nv_popcll(long long __a);
__DEVICE__ double __nv_pow(double __a, double __b);
__DEVICE__ float __nv_powf(float __a, float __b);
__DEVICE__ double __nv_powi(double __a, int __b);
__DEVICE__ float __nv_powif(float __a, int __b);
__DEVICE__ double __nv_rcbrt(double __a);
__DEVICE__ float __nv_rcbrtf(float __a);
__DEVICE__ double __nv_rcp64h(double __a);
__DEVICE__ double __nv_remainder(double __a, double __b);
__DEVICE__ float __nv_remainderf(float __a, float __b);
__DEVICE__ double __nv_remquo(double __a, double __b, int *__c);
__DEVICE__ float __nv_remquof(float __a, float __b, int *__c);
__DEVICE__ int __nv_rhadd(int __a, int __b);
__DEVICE__ double __nv_rhypot(double __a, double __b);
__DEVICE__ float __nv_rhypotf(float __a, float __b);
__DEVICE__ double __nv_rint(double __a);
__DEVICE__ float __nv_rintf(float __a);
__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__DEVICE__ float __nv_rnormf(int __a, const float *__b);
__DEVICE__ double __nv_rnorm(int __a, const double *__b);
__DEVICE__ double __nv_round(double __a);
__DEVICE__ float __nv_roundf(float __a);
__DEVICE__ double __nv_rsqrt(double __a);
__DEVICE__ float __nv_rsqrtf(float __a);
__DEVICE__ int __nv_sad(int __a, int __b, int __c);
__DEVICE__ float __nv_saturatef(float __a);
__DEVICE__ double __nv_scalbn(double __a, int __b);
__DEVICE__ float __nv_scalbnf(float __a, int __b);
__DEVICE__ int __nv_signbitd(double __a);
__DEVICE__ int __nv_signbitf(float __a);
__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c);
__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c);
__DEVICE__ double __nv_sin(double __a);
__DEVICE__ float __nv_sinf(float __a);
__DEVICE__ double __nv_sinh(double __a);
__DEVICE__ float __nv_sinhf(float __a);
__DEVICE__ double __nv_sinpi(double __a);
__DEVICE__ float __nv_sinpif(float __a);
__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_tan(double __a);
__DEVICE__ float __nv_tanf(float __a);
__DEVICE__ double __nv_tanh(double __a);
__DEVICE__ float __nv_tanhf(float __a);
__DEVICE__ double __nv_tgamma(double __a);
__DEVICE__ float __nv_tgammaf(float __a);
__DEVICE__ double __nv_trunc(double __a);
__DEVICE__ float __nv_truncf(float __a);
__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__DEVICE__ double __nv_uint2double_rn(unsigned int __i);
__DEVICE__ float __nv_uint2float_rd(unsigned int __a);
__DEVICE__ float __nv_uint2float_rn(unsigned int __a);
__DEVICE__ float __nv_uint2float_ru(unsigned int __a);
__DEVICE__ float __nv_uint2float_rz(unsigned int __a);
__DEVICE__ float __nv_uint_as_float(unsigned int __a);
__DEVICE__ double __nv_ull2double_rd(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rn(unsigned long long __a);
__DEVICE__ double __nv_ull2double_ru(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rz(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rd(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rn(unsigned long long __a);
__DEVICE__ float __nv_ull2float_ru(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rz(unsigned long long __a);
__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a,
unsigned long long __b);
__device__ unsigned long long __nv_ullmin(unsigned long long __a,
__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__device__ int __nv_vabs2(int __a);
__device__ int __nv_vabs4(int __a);
__device__ int __nv_vabsdiffs2(int __a, int __b);
__device__ int __nv_vabsdiffs4(int __a, int __b);
__device__ int __nv_vabsdiffu2(int __a, int __b);
__device__ int __nv_vabsdiffu4(int __a, int __b);
__device__ int __nv_vabsss2(int __a);
__device__ int __nv_vabsss4(int __a);
__device__ int __nv_vadd2(int __a, int __b);
__device__ int __nv_vadd4(int __a, int __b);
__device__ int __nv_vaddss2(int __a, int __b);
__device__ int __nv_vaddss4(int __a, int __b);
__device__ int __nv_vaddus2(int __a, int __b);
__device__ int __nv_vaddus4(int __a, int __b);
__device__ int __nv_vavgs2(int __a, int __b);
__device__ int __nv_vavgs4(int __a, int __b);
__device__ int __nv_vavgu2(int __a, int __b);
__device__ int __nv_vavgu4(int __a, int __b);
__device__ int __nv_vcmpeq2(int __a, int __b);
__device__ int __nv_vcmpeq4(int __a, int __b);
__device__ int __nv_vcmpges2(int __a, int __b);
__device__ int __nv_vcmpges4(int __a, int __b);
__device__ int __nv_vcmpgeu2(int __a, int __b);
__device__ int __nv_vcmpgeu4(int __a, int __b);
__device__ int __nv_vcmpgts2(int __a, int __b);
__device__ int __nv_vcmpgts4(int __a, int __b);
__device__ int __nv_vcmpgtu2(int __a, int __b);
__device__ int __nv_vcmpgtu4(int __a, int __b);
__device__ int __nv_vcmples2(int __a, int __b);
__device__ int __nv_vcmples4(int __a, int __b);
__device__ int __nv_vcmpleu2(int __a, int __b);
__device__ int __nv_vcmpleu4(int __a, int __b);
__device__ int __nv_vcmplts2(int __a, int __b);
__device__ int __nv_vcmplts4(int __a, int __b);
__device__ int __nv_vcmpltu2(int __a, int __b);
__device__ int __nv_vcmpltu4(int __a, int __b);
__device__ int __nv_vcmpne2(int __a, int __b);
__device__ int __nv_vcmpne4(int __a, int __b);
__device__ int __nv_vhaddu2(int __a, int __b);
__device__ int __nv_vhaddu4(int __a, int __b);
__device__ int __nv_vmaxs2(int __a, int __b);
__device__ int __nv_vmaxs4(int __a, int __b);
__device__ int __nv_vmaxu2(int __a, int __b);
__device__ int __nv_vmaxu4(int __a, int __b);
__device__ int __nv_vmins2(int __a, int __b);
__device__ int __nv_vmins4(int __a, int __b);
__device__ int __nv_vminu2(int __a, int __b);
__device__ int __nv_vminu4(int __a, int __b);
__device__ int __nv_vneg2(int __a);
__device__ int __nv_vneg4(int __a);
__device__ int __nv_vnegss2(int __a);
__device__ int __nv_vnegss4(int __a);
__device__ int __nv_vsads2(int __a, int __b);
__device__ int __nv_vsads4(int __a, int __b);
__device__ int __nv_vsadu2(int __a, int __b);
__device__ int __nv_vsadu4(int __a, int __b);
__device__ int __nv_vseteq2(int __a, int __b);
__device__ int __nv_vseteq4(int __a, int __b);
__device__ int __nv_vsetges2(int __a, int __b);
__device__ int __nv_vsetges4(int __a, int __b);
__device__ int __nv_vsetgeu2(int __a, int __b);
__device__ int __nv_vsetgeu4(int __a, int __b);
__device__ int __nv_vsetgts2(int __a, int __b);
__device__ int __nv_vsetgts4(int __a, int __b);
__device__ int __nv_vsetgtu2(int __a, int __b);
__device__ int __nv_vsetgtu4(int __a, int __b);
__device__ int __nv_vsetles2(int __a, int __b);
__device__ int __nv_vsetles4(int __a, int __b);
__device__ int __nv_vsetleu2(int __a, int __b);
__device__ int __nv_vsetleu4(int __a, int __b);
__device__ int __nv_vsetlts2(int __a, int __b);
__device__ int __nv_vsetlts4(int __a, int __b);
__device__ int __nv_vsetltu2(int __a, int __b);
__device__ int __nv_vsetltu4(int __a, int __b);
__device__ int __nv_vsetne2(int __a, int __b);
__device__ int __nv_vsetne4(int __a, int __b);
__device__ int __nv_vsub2(int __a, int __b);
__device__ int __nv_vsub4(int __a, int __b);
__device__ int __nv_vsubss2(int __a, int __b);
__device__ int __nv_vsubss4(int __a, int __b);
__device__ int __nv_vsubus2(int __a, int __b);
__device__ int __nv_vsubus4(int __a, int __b);
__DEVICE__ int __nv_vabs2(int __a);
__DEVICE__ int __nv_vabs4(int __a);
__DEVICE__ int __nv_vabsdiffs2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffs4(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu4(int __a, int __b);
__DEVICE__ int __nv_vabsss2(int __a);
__DEVICE__ int __nv_vabsss4(int __a);
__DEVICE__ int __nv_vadd2(int __a, int __b);
__DEVICE__ int __nv_vadd4(int __a, int __b);
__DEVICE__ int __nv_vaddss2(int __a, int __b);
__DEVICE__ int __nv_vaddss4(int __a, int __b);
__DEVICE__ int __nv_vaddus2(int __a, int __b);
__DEVICE__ int __nv_vaddus4(int __a, int __b);
__DEVICE__ int __nv_vavgs2(int __a, int __b);
__DEVICE__ int __nv_vavgs4(int __a, int __b);
__DEVICE__ int __nv_vavgu2(int __a, int __b);
__DEVICE__ int __nv_vavgu4(int __a, int __b);
__DEVICE__ int __nv_vcmpeq2(int __a, int __b);
__DEVICE__ int __nv_vcmpeq4(int __a, int __b);
__DEVICE__ int __nv_vcmpges2(int __a, int __b);
__DEVICE__ int __nv_vcmpges4(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu4(int __a, int __b);
__DEVICE__ int __nv_vcmpgts2(int __a, int __b);
__DEVICE__ int __nv_vcmpgts4(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu4(int __a, int __b);
__DEVICE__ int __nv_vcmples2(int __a, int __b);
__DEVICE__ int __nv_vcmples4(int __a, int __b);
__DEVICE__ int __nv_vcmpleu2(int __a, int __b);
__DEVICE__ int __nv_vcmpleu4(int __a, int __b);
__DEVICE__ int __nv_vcmplts2(int __a, int __b);
__DEVICE__ int __nv_vcmplts4(int __a, int __b);
__DEVICE__ int __nv_vcmpltu2(int __a, int __b);
__DEVICE__ int __nv_vcmpltu4(int __a, int __b);
__DEVICE__ int __nv_vcmpne2(int __a, int __b);
__DEVICE__ int __nv_vcmpne4(int __a, int __b);
__DEVICE__ int __nv_vhaddu2(int __a, int __b);
__DEVICE__ int __nv_vhaddu4(int __a, int __b);
__DEVICE__ int __nv_vmaxs2(int __a, int __b);
__DEVICE__ int __nv_vmaxs4(int __a, int __b);
__DEVICE__ int __nv_vmaxu2(int __a, int __b);
__DEVICE__ int __nv_vmaxu4(int __a, int __b);
__DEVICE__ int __nv_vmins2(int __a, int __b);
__DEVICE__ int __nv_vmins4(int __a, int __b);
__DEVICE__ int __nv_vminu2(int __a, int __b);
__DEVICE__ int __nv_vminu4(int __a, int __b);
__DEVICE__ int __nv_vneg2(int __a);
__DEVICE__ int __nv_vneg4(int __a);
__DEVICE__ int __nv_vnegss2(int __a);
__DEVICE__ int __nv_vnegss4(int __a);
__DEVICE__ int __nv_vsads2(int __a, int __b);
__DEVICE__ int __nv_vsads4(int __a, int __b);
__DEVICE__ int __nv_vsadu2(int __a, int __b);
__DEVICE__ int __nv_vsadu4(int __a, int __b);
__DEVICE__ int __nv_vseteq2(int __a, int __b);
__DEVICE__ int __nv_vseteq4(int __a, int __b);
__DEVICE__ int __nv_vsetges2(int __a, int __b);
__DEVICE__ int __nv_vsetges4(int __a, int __b);
__DEVICE__ int __nv_vsetgeu2(int __a, int __b);
__DEVICE__ int __nv_vsetgeu4(int __a, int __b);
__DEVICE__ int __nv_vsetgts2(int __a, int __b);
__DEVICE__ int __nv_vsetgts4(int __a, int __b);
__DEVICE__ int __nv_vsetgtu2(int __a, int __b);
__DEVICE__ int __nv_vsetgtu4(int __a, int __b);
__DEVICE__ int __nv_vsetles2(int __a, int __b);
__DEVICE__ int __nv_vsetles4(int __a, int __b);
__DEVICE__ int __nv_vsetleu2(int __a, int __b);
__DEVICE__ int __nv_vsetleu4(int __a, int __b);
__DEVICE__ int __nv_vsetlts2(int __a, int __b);
__DEVICE__ int __nv_vsetlts4(int __a, int __b);
__DEVICE__ int __nv_vsetltu2(int __a, int __b);
__DEVICE__ int __nv_vsetltu4(int __a, int __b);
__DEVICE__ int __nv_vsetne2(int __a, int __b);
__DEVICE__ int __nv_vsetne4(int __a, int __b);
__DEVICE__ int __nv_vsub2(int __a, int __b);
__DEVICE__ int __nv_vsub4(int __a, int __b);
__DEVICE__ int __nv_vsubss2(int __a, int __b);
__DEVICE__ int __nv_vsubss4(int __a, int __b);
__DEVICE__ int __nv_vsubus2(int __a, int __b);
__DEVICE__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__device__ double __nv_y0(double __a);
__device__ float __nv_y0f(float __a);
__device__ double __nv_y1(double __a);
__device__ float __nv_y1f(float __a);
__device__ float __nv_ynf(int __a, float __b);
__device__ double __nv_yn(int __a, double __b);
__DEVICE__ double __nv_y0(double __a);
__DEVICE__ float __nv_y0f(float __a);
__DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+46 -24
@@ -1,22 +1,8 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -34,14 +20,37 @@
// would preclude the use of our own __device__ overloads for these functions.
#pragma push_macro("__DEVICE__")
#ifdef _OPENMP
#define __DEVICE__ static __inline__ __attribute__((always_inline))
#else
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
#endif
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
__DEVICE__ int abs(int);
// For C++ 17 we need to include noexcept attribute to be compatible
// with the header-defined version. This may be removed once
// variant is supported.
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
#define __NOEXCEPT noexcept
#else
#define __NOEXCEPT
#endif
#if !(defined(_OPENMP) && defined(__cplusplus))
__DEVICE__ long abs(long);
__DEVICE__ long long abs(long long);
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
#endif
// While providing the CUDA declarations and definitions for math functions,
// we may manually define additional functions.
// TODO: Once variant is supported the additional functions will have
// to be removed.
#if defined(_OPENMP) && defined(__cplusplus)
__DEVICE__ const double abs(const double);
__DEVICE__ const float abs(const float);
#endif
__DEVICE__ int abs(int) __NOEXCEPT;
__DEVICE__ double acos(double);
__DEVICE__ float acos(float);
__DEVICE__ double acosh(double);
@@ -76,8 +85,8 @@ __DEVICE__ double exp(double);
__DEVICE__ float exp(float);
__DEVICE__ double expm1(double);
__DEVICE__ float expm1(float);
__DEVICE__ double fabs(double);
__DEVICE__ float fabs(float);
__DEVICE__ double fabs(double) __NOEXCEPT;
__DEVICE__ float fabs(float) __NOEXCEPT;
__DEVICE__ double fdim(double, double);
__DEVICE__ float fdim(float, float);
__DEVICE__ double floor(double);
@@ -98,12 +107,18 @@ __DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
#ifdef _MSC_VER
__DEVICE__ bool isfinite(long double);
#endif
__DEVICE__ bool isfinite(double);
__DEVICE__ bool isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isinf(long double);
#endif
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
@@ -112,18 +127,21 @@ __DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isnan(long double);
#endif
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ long labs(long) __NOEXCEPT;
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llabs(long long) __NOEXCEPT;
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
@@ -134,6 +152,9 @@ __DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
#if defined(_OPENMP) && defined(__cplusplus)
__DEVICE__ long double log(long double);
#endif
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
@@ -281,6 +302,7 @@ _GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif
#undef __NOEXCEPT
#pragma pop_macro("__DEVICE__")
#endif
+14 -18
@@ -1,22 +1,8 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -62,7 +48,7 @@
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10000
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10010
#error "Unsupported CUDA version!"
#endif
@@ -426,5 +412,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
// CUDA runtime uses this undocumented function to access kernel launch
// configuration. The declaration is in crt/device_functions.h but that file
// includes a lot of other stuff we don't want. Instead, we'll provide our own
// declaration for it here.
#if CUDA_VERSION >= 9020
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
size_t sharedMem = 0,
void *stream = 0);
#endif
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
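For context, a hedged sketch of what the __cudaPushCallConfiguration declaration above is used for: with the CUDA 9.2+ launch API, clang lowers a triple-chevron kernel launch into a call to __cudaPushCallConfiguration followed by a call to the kernel's generated host stub, which pops the configuration and forwards it to cudaLaunchKernel. The snippet below is illustrative only (it assumes a CUDA toolkit plus a GPU, and the stub name in the comment is hypothetical); it is not part of the header:

// scale.cu -- compile with nvcc or a CUDA-capable clang
#include <cuda_runtime.h>

__global__ void scale(float *data, float factor) {
  data[threadIdx.x] *= factor;
}

int main() {
  float *buf = nullptr;
  cudaMalloc(&buf, 256 * sizeof(float));

  // The source-level launch:
  //   scale<<<dim3(1), dim3(256)>>>(buf, 2.0f);
  // is roughly lowered by the compiler to:
  //   __cudaPushCallConfiguration(dim3(1), dim3(256), /*sharedMem=*/0, /*stream=*/nullptr);
  //   __scale_host_stub(buf, 2.0f);  // hypothetical stub name; pops the config, calls cudaLaunchKernel
  scale<<<dim3(1), dim3(256)>>>(buf, 2.0f);

  cudaDeviceSynchronize();
  cudaFree(buf);
  return 0;
}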
+3 -19
@@ -1,24 +1,8 @@
/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
*
* Copyright (c) 2014 Chandler Carruth
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- altivec.h - Standard header for type generic math ---------------===*\
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+21 -17
@@ -1,22 +1,8 @@
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -611,6 +597,14 @@ __crc32cd(uint32_t __a, uint64_t __b) {
}
#endif
/* Armv8.3-A Javascript conversion intrinsic */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__jcvt(double __a) {
return __builtin_arm_jcvt(__a);
}
#endif
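A hedged usage sketch for the __jcvt intrinsic added above (assumes compilation for an Armv8.3-A target with the JCVT feature enabled, e.g. -march=armv8.3-a, so that __ARM_FEATURE_JCVT is defined; not part of the header). The intrinsic maps to the FJCVTZS instruction, which converts a double to a signed 32-bit integer with JavaScript semantics: truncation toward zero and modulo-2^32 wrapping instead of saturation.

#include <arm_acle.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int32_t a = __jcvt(3.9);          /* truncates toward zero: 3 */
  int32_t b = __jcvt(4294967303.0); /* 2^32 + 7 wraps to 7 under JS ToInt32 semantics */
  printf("%d %d\n", a, b);
  return 0;
}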
/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
@@ -619,6 +613,16 @@ __crc32cd(uint32_t __a, uint64_t __b) {
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
// Memory Tagging Extensions (MTE) Intrinsics
#if __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
#endif
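A hedged sketch of how these MTE wrappers compose in practice (assumes an Armv8.5-A target built with MTE support, e.g. -march=armv8.5-a+memtag, and a runtime with tagged memory enabled; the helper names below are hypothetical and not part of the header):

#include <arm_acle.h>
#include <stddef.h>

/* Assign a fresh random allocation tag to the 16-byte-aligned granule at p
 * and return the tagged pointer that should be used for later accesses. */
void *tag_granule(void *p) {
  void *tagged = __arm_mte_create_random_tag(p, 0); /* 0 = no excluded tags */
  __arm_mte_set_tag(tagged);                        /* store the tag for the granule */
  return tagged;
}

/* Pointer difference that ignores the tag bits in the pointers' top bytes. */
ptrdiff_t untagged_diff(void *a, void *b) {
  return __arm_mte_ptrdiff(a, b);
}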
#if defined(__cplusplus)
}
#endif
+198 -198
@@ -44247,13 +44247,13 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2)
#endif
#if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)
#ifdef __LITTLE_ENDIAN__
__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
}
#else
__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -44262,7 +44262,7 @@ __ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
__ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t __noswap_vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
@@ -44270,13 +44270,13 @@ __ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, f
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
}
#else
__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
@@ -44285,7 +44285,7 @@ __ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
return __ret;
}
__ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t __noswap_vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
@@ -44293,13 +44293,13 @@ __ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, fl
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
}
#else
__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -44308,7 +44308,7 @@ __ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
__ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t __noswap_vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
@@ -44316,13 +44316,13 @@ __ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, fl
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
}
#else
__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
@@ -44331,7 +44331,7 @@ __ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
return __ret;
}
__ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t __noswap_vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
@@ -44339,13 +44339,13 @@ __ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, flo
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
}
#else
__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -44354,7 +44354,7 @@ __ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
__ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t __noswap_vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
@@ -44362,13 +44362,13 @@ __ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, f
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
}
#else
__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
@@ -44377,7 +44377,7 @@ __ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
return __ret;
}
__ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t __noswap_vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
@@ -44385,13 +44385,13 @@ __ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, fl
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
}
#else
__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -44400,7 +44400,7 @@ __ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
__ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
__ai float32x4_t __noswap_vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
float32x4_t __ret;
__ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
return __ret;
@@ -44408,13 +44408,13 @@ __ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, fl
#endif
#ifdef __LITTLE_ENDIAN__
__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
}
#else
__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
@@ -44423,7 +44423,7 @@ __ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
return __ret;
}
__ai float32x2_t __noswap_vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
__ai float32x2_t __noswap_vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
float32x2_t __ret;
__ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
return __ret;
@@ -64095,15 +64095,15 @@ __ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
return __ret;
}
#else
__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64129,15 +64129,15 @@ __ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0);
return __ret;
}
#else
__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64203,17 +64203,17 @@ __ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32);
return __ret;
}
#else
__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) {
int8x16x2_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64241,17 +64241,17 @@ __ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0);
return __ret;
}
#else
__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) {
int8x16x2_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64320,18 +64320,18 @@ __ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32);
return __ret;
}
#else
__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) {
int8x16x3_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64360,18 +64360,18 @@ __ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0);
return __ret;
}
#else
__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) {
int8x16x3_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64443,19 +64443,19 @@ __ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32);
return __ret;
}
#else
__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) {
int8x16x4_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64485,19 +64485,19 @@ __ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0);
return __ret;
}
#else
__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) {
int8x16x4_t __rev0;
__rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64560,16 +64560,16 @@ __ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
return __ret;
}
#else
__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64596,16 +64596,16 @@ __ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0);
return __ret;
}
#else
__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) {
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64674,18 +64674,18 @@ __ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2)
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32);
return __ret;
}
#else
__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x2_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64714,18 +64714,18 @@ __ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0);
return __ret;
}
#else
__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) {
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x2_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64797,19 +64797,19 @@ __ai uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2)
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32);
return __ret;
}
#else
__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x3_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64839,19 +64839,19 @@ __ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0);
return __ret;
}
#else
__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) {
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x3_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64926,20 +64926,20 @@ __ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2)
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32);
return __ret;
}
#else
__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x4_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -64970,20 +64970,20 @@ __ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0);
return __ret;
}
#else
__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) {
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16x4_t __rev1;
__rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
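Across the vqtbx2/vqtbx3/vqtbx4 hunks above, the signed table-lookup-extension intrinsics now take their index vector as an unsigned type (uint8x8_t / uint8x16_t), in line with the ACLE signatures. A minimal caller-side sketch of the updated signature, assuming an AArch64 target; the helper name is my own, not part of the header:

#include <arm_neon.h>

/* Hypothetical helper: look up bytes of idx in a 32-byte, two-register
 * table; lanes whose index is >= 32 keep the corresponding lane of
 * fallback (the "extension" behaviour of vqtbx). */
static int8x8_t lookup_or_keep(int8x8_t fallback, int8x16x2_t table, uint8x8_t idx) {
    /* The index operand is now uint8x8_t rather than int8x8_t. */
    return vqtbx2_s8(fallback, table, idx);
}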
@@ -66293,13 +66293,13 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
__ai uint8_t vsqaddb_u8(uint8_t __p0, int8_t __p1) {
uint8_t __ret;
__ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
return __ret;
}
#else
__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
__ai uint8_t vsqaddb_u8(uint8_t __p0, int8_t __p1) {
uint8_t __ret;
__ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
return __ret;
@@ -66307,13 +66307,13 @@ __ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) {
uint32_t __ret;
__ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
return __ret;
}
#else
__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) {
uint32_t __ret;
__ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
return __ret;
@@ -66321,13 +66321,13 @@ __ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) {
uint64_t __ret;
__ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
return __ret;
}
#else
__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) {
uint64_t __ret;
__ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
return __ret;
@@ -66335,13 +66335,13 @@ __ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) {
uint16_t __ret;
__ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
return __ret;
}
#else
__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) {
uint16_t __ret;
__ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
return __ret;
@@ -66349,15 +66349,15 @@ __ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) {
uint8x16_t __ret;
__ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
return __ret;
}
#else
__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) {
uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __ret;
__ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -66366,15 +66366,15 @@ __ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) {
uint32x4_t __ret;
__ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
return __ret;
}
#else
__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) {
uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
uint32x4_t __ret;
__ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
@@ -66383,15 +66383,15 @@ __ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) {
uint64x2_t __ret;
__ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
return __ret;
}
#else
__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) {
uint64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
uint64x2_t __ret;
__ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
@@ -66400,15 +66400,15 @@ __ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) {
uint16x8_t __ret;
__ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
return __ret;
}
#else
__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) {
uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint16x8_t __ret;
__ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -66417,15 +66417,15 @@ __ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) {
uint8x8_t __ret;
__ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
return __ret;
}
#else
__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) {
uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __ret;
__ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -66434,15 +66434,15 @@ __ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) {
uint32x2_t __ret;
__ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
return __ret;
}
#else
__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) {
uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
int32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
uint32x2_t __ret;
__ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
@@ -66451,13 +66451,13 @@ __ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) {
uint64x1_t __ret;
__ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
return __ret;
}
#else
__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) {
uint64x1_t __ret;
__ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
return __ret;
@@ -66465,15 +66465,15 @@ __ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) {
uint16x4_t __ret;
__ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
return __ret;
}
#else
__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) {
uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
uint16x4_t __ret;
__ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
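The vsqadd* family above (USQADD: unsigned saturating accumulate of a signed value) now declares its second operand as a signed type, which is what the instruction actually consumes. A short usage sketch under the assumption of an AArch64 target; the helper name is illustrative only:

#include <arm_neon.h>

/* Add a signed per-lane adjustment into an unsigned accumulator,
 * saturating the result to [0, UINT32_MAX]. */
static uint32x4_t apply_signed_delta(uint32x4_t acc, int32x4_t delta) {
    return vsqaddq_u32(acc, delta);  /* second operand is now int32x4_t */
}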
@@ -68919,13 +68919,13 @@ __ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) {
int8_t __ret;
__ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
return __ret;
}
#else
__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) {
int8_t __ret;
__ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
return __ret;
@@ -68933,13 +68933,13 @@ __ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) {
int32_t __ret;
__ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
return __ret;
}
#else
__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) {
int32_t __ret;
__ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
return __ret;
@@ -68947,13 +68947,13 @@ __ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) {
int64_t __ret;
__ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
return __ret;
}
#else
__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) {
int64_t __ret;
__ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
return __ret;
@@ -68961,13 +68961,13 @@ __ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) {
int16_t __ret;
__ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
return __ret;
}
#else
__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) {
int16_t __ret;
__ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
return __ret;
@@ -68975,15 +68975,15 @@ __ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) {
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
return __ret;
}
#else
__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) {
int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
int8x16_t __ret;
__ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -68992,15 +68992,15 @@ __ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) {
int32x4_t __ret;
__ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
return __ret;
}
#else
__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) {
int32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
int32x4_t __ret;
__ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
@@ -69009,15 +69009,15 @@ __ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) {
int64x2_t __ret;
__ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
return __ret;
}
#else
__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) {
int64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
int64x2_t __ret;
__ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
@@ -69026,15 +69026,15 @@ __ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) {
int16x8_t __ret;
__ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
return __ret;
}
#else
__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) {
int16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int16x8_t __ret;
__ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -69043,15 +69043,15 @@ __ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) {
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
return __ret;
}
#else
__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) {
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
int8x8_t __ret;
__ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -69060,15 +69060,15 @@ __ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) {
int32x2_t __ret;
__ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
return __ret;
}
#else
__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) {
int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
int32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
int32x2_t __ret;
__ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
@@ -69077,13 +69077,13 @@ __ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) {
int64x1_t __ret;
__ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
return __ret;
}
#else
__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) {
int64x1_t __ret;
__ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
return __ret;
@@ -69091,15 +69091,15 @@ __ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
#endif
#ifdef __LITTLE_ENDIAN__
__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) {
int16x4_t __ret;
__ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
return __ret;
}
#else
__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) {
int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
int16x4_t __ret;
__ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
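Symmetrically, the vuqadd* family above (SUQADD: signed saturating accumulate of an unsigned value) now takes an unsigned second operand. A hedged sketch, again with an illustrative helper name and assuming AArch64:

#include <arm_neon.h>

/* Add an unsigned per-lane magnitude into a signed accumulator,
 * saturating at INT32_MAX. */
static int32x4_t add_unsigned_magnitude(int32x4_t acc, uint32x4_t mag) {
    return vuqaddq_s32(acc, mag);    /* second operand is now uint32x4_t */
}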
@@ -71912,16 +71912,16 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
#if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)
#ifdef __LITTLE_ENDIAN__
#define vfmlalq_lane_high_u32(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \
#define vfmlalq_lane_high_f16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \
float32x4_t __s0_264 = __p0_264; \
float16x8_t __s1_264 = __p1_264; \
float16x4_t __s2_264 = __p2_264; \
float32x4_t __ret_264; \
__ret_264 = vfmlalq_high_u32(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \
__ret_264 = vfmlalq_high_f16(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \
__ret_264; \
})
#else
#define vfmlalq_lane_high_u32(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \
#define vfmlalq_lane_high_f16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \
float32x4_t __s0_265 = __p0_265; \
float16x8_t __s1_265 = __p1_265; \
float16x4_t __s2_265 = __p2_265; \
@@ -71929,23 +71929,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_265; __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x4_t __rev2_265; __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 3, 2, 1, 0); \
float32x4_t __ret_265; \
__ret_265 = __noswap_vfmlalq_high_u32(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \
__ret_265 = __noswap_vfmlalq_high_f16(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \
__ret_265 = __builtin_shufflevector(__ret_265, __ret_265, 3, 2, 1, 0); \
__ret_265; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlal_lane_high_u32(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \
#define vfmlal_lane_high_f16(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \
float32x2_t __s0_266 = __p0_266; \
float16x4_t __s1_266 = __p1_266; \
float16x4_t __s2_266 = __p2_266; \
float32x2_t __ret_266; \
__ret_266 = vfmlal_high_u32(__s0_266, __s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \
__ret_266 = vfmlal_high_f16(__s0_266, __s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \
__ret_266; \
})
#else
#define vfmlal_lane_high_u32(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \
#define vfmlal_lane_high_f16(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \
float32x2_t __s0_267 = __p0_267; \
float16x4_t __s1_267 = __p1_267; \
float16x4_t __s2_267 = __p2_267; \
@@ -71953,23 +71953,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_267; __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 3, 2, 1, 0); \
float16x4_t __rev2_267; __rev2_267 = __builtin_shufflevector(__s2_267, __s2_267, 3, 2, 1, 0); \
float32x2_t __ret_267; \
__ret_267 = __noswap_vfmlal_high_u32(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \
__ret_267 = __noswap_vfmlal_high_f16(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \
__ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \
__ret_267; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlalq_lane_low_u32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
#define vfmlalq_lane_low_f16(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
float32x4_t __s0_268 = __p0_268; \
float16x8_t __s1_268 = __p1_268; \
float16x4_t __s2_268 = __p2_268; \
float32x4_t __ret_268; \
__ret_268 = vfmlalq_low_u32(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \
__ret_268 = vfmlalq_low_f16(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \
__ret_268; \
})
#else
#define vfmlalq_lane_low_u32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
#define vfmlalq_lane_low_f16(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
float32x4_t __s0_269 = __p0_269; \
float16x8_t __s1_269 = __p1_269; \
float16x4_t __s2_269 = __p2_269; \
@@ -71977,23 +71977,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_269; __rev1_269 = __builtin_shufflevector(__s1_269, __s1_269, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x4_t __rev2_269; __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 3, 2, 1, 0); \
float32x4_t __ret_269; \
__ret_269 = __noswap_vfmlalq_low_u32(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \
__ret_269 = __noswap_vfmlalq_low_f16(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \
__ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 3, 2, 1, 0); \
__ret_269; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlal_lane_low_u32(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
#define vfmlal_lane_low_f16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
float32x2_t __s0_270 = __p0_270; \
float16x4_t __s1_270 = __p1_270; \
float16x4_t __s2_270 = __p2_270; \
float32x2_t __ret_270; \
__ret_270 = vfmlal_low_u32(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \
__ret_270 = vfmlal_low_f16(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \
__ret_270; \
})
#else
#define vfmlal_lane_low_u32(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
#define vfmlal_lane_low_f16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
float32x2_t __s0_271 = __p0_271; \
float16x4_t __s1_271 = __p1_271; \
float16x4_t __s2_271 = __p2_271; \
@@ -72001,23 +72001,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_271; __rev1_271 = __builtin_shufflevector(__s1_271, __s1_271, 3, 2, 1, 0); \
float16x4_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \
float32x2_t __ret_271; \
__ret_271 = __noswap_vfmlal_low_u32(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \
__ret_271 = __noswap_vfmlal_low_f16(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \
__ret_271 = __builtin_shufflevector(__ret_271, __ret_271, 1, 0); \
__ret_271; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlalq_laneq_high_u32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
#define vfmlalq_laneq_high_f16(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
float32x4_t __s0_272 = __p0_272; \
float16x8_t __s1_272 = __p1_272; \
float16x8_t __s2_272 = __p2_272; \
float32x4_t __ret_272; \
__ret_272 = vfmlalq_high_u32(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272)}); \
__ret_272 = vfmlalq_high_f16(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272)}); \
__ret_272; \
})
#else
#define vfmlalq_laneq_high_u32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
#define vfmlalq_laneq_high_f16(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
float32x4_t __s0_273 = __p0_273; \
float16x8_t __s1_273 = __p1_273; \
float16x8_t __s2_273 = __p2_273; \
@@ -72025,23 +72025,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_273; __rev1_273 = __builtin_shufflevector(__s1_273, __s1_273, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x4_t __ret_273; \
__ret_273 = __noswap_vfmlalq_high_u32(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \
__ret_273 = __noswap_vfmlalq_high_f16(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \
__ret_273 = __builtin_shufflevector(__ret_273, __ret_273, 3, 2, 1, 0); \
__ret_273; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlal_laneq_high_u32(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
#define vfmlal_laneq_high_f16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
float32x2_t __s0_274 = __p0_274; \
float16x4_t __s1_274 = __p1_274; \
float16x8_t __s2_274 = __p2_274; \
float32x2_t __ret_274; \
__ret_274 = vfmlal_high_u32(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \
__ret_274 = vfmlal_high_f16(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \
__ret_274; \
})
#else
#define vfmlal_laneq_high_u32(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
#define vfmlal_laneq_high_f16(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
float32x2_t __s0_275 = __p0_275; \
float16x4_t __s1_275 = __p1_275; \
float16x8_t __s2_275 = __p2_275; \
@@ -72049,23 +72049,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_275; __rev1_275 = __builtin_shufflevector(__s1_275, __s1_275, 3, 2, 1, 0); \
float16x8_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x2_t __ret_275; \
__ret_275 = __noswap_vfmlal_high_u32(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \
__ret_275 = __noswap_vfmlal_high_f16(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \
__ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \
__ret_275; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlalq_laneq_low_u32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
#define vfmlalq_laneq_low_f16(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
float32x4_t __s0_276 = __p0_276; \
float16x8_t __s1_276 = __p1_276; \
float16x8_t __s2_276 = __p2_276; \
float32x4_t __ret_276; \
__ret_276 = vfmlalq_low_u32(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \
__ret_276 = vfmlalq_low_f16(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \
__ret_276; \
})
#else
#define vfmlalq_laneq_low_u32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
#define vfmlalq_laneq_low_f16(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
float32x4_t __s0_277 = __p0_277; \
float16x8_t __s1_277 = __p1_277; \
float16x8_t __s2_277 = __p2_277; \
@@ -72073,23 +72073,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_277; __rev1_277 = __builtin_shufflevector(__s1_277, __s1_277, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x4_t __ret_277; \
__ret_277 = __noswap_vfmlalq_low_u32(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \
__ret_277 = __noswap_vfmlalq_low_f16(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \
__ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 3, 2, 1, 0); \
__ret_277; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlal_laneq_low_u32(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
#define vfmlal_laneq_low_f16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
float32x2_t __s0_278 = __p0_278; \
float16x4_t __s1_278 = __p1_278; \
float16x8_t __s2_278 = __p2_278; \
float32x2_t __ret_278; \
__ret_278 = vfmlal_low_u32(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \
__ret_278 = vfmlal_low_f16(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \
__ret_278; \
})
#else
#define vfmlal_laneq_low_u32(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
#define vfmlal_laneq_low_f16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
float32x2_t __s0_279 = __p0_279; \
float16x4_t __s1_279 = __p1_279; \
float16x8_t __s2_279 = __p2_279; \
@@ -72097,23 +72097,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_279; __rev1_279 = __builtin_shufflevector(__s1_279, __s1_279, 3, 2, 1, 0); \
float16x8_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x2_t __ret_279; \
__ret_279 = __noswap_vfmlal_low_u32(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \
__ret_279 = __noswap_vfmlal_low_f16(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \
__ret_279 = __builtin_shufflevector(__ret_279, __ret_279, 1, 0); \
__ret_279; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlslq_lane_high_u32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
#define vfmlslq_lane_high_f16(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
float32x4_t __s0_280 = __p0_280; \
float16x8_t __s1_280 = __p1_280; \
float16x4_t __s2_280 = __p2_280; \
float32x4_t __ret_280; \
__ret_280 = vfmlslq_high_u32(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \
__ret_280 = vfmlslq_high_f16(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \
__ret_280; \
})
#else
#define vfmlslq_lane_high_u32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
#define vfmlslq_lane_high_f16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
float32x4_t __s0_281 = __p0_281; \
float16x8_t __s1_281 = __p1_281; \
float16x4_t __s2_281 = __p2_281; \
@@ -72121,23 +72121,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_281; __rev1_281 = __builtin_shufflevector(__s1_281, __s1_281, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \
float32x4_t __ret_281; \
__ret_281 = __noswap_vfmlslq_high_u32(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \
__ret_281 = __noswap_vfmlslq_high_f16(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \
__ret_281 = __builtin_shufflevector(__ret_281, __ret_281, 3, 2, 1, 0); \
__ret_281; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlsl_lane_high_u32(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \
#define vfmlsl_lane_high_f16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \
float32x2_t __s0_282 = __p0_282; \
float16x4_t __s1_282 = __p1_282; \
float16x4_t __s2_282 = __p2_282; \
float32x2_t __ret_282; \
__ret_282 = vfmlsl_high_u32(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \
__ret_282 = vfmlsl_high_f16(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \
__ret_282; \
})
#else
#define vfmlsl_lane_high_u32(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \
#define vfmlsl_lane_high_f16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \
float32x2_t __s0_283 = __p0_283; \
float16x4_t __s1_283 = __p1_283; \
float16x4_t __s2_283 = __p2_283; \
@@ -72145,23 +72145,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_283; __rev1_283 = __builtin_shufflevector(__s1_283, __s1_283, 3, 2, 1, 0); \
float16x4_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 3, 2, 1, 0); \
float32x2_t __ret_283; \
__ret_283 = __noswap_vfmlsl_high_u32(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \
__ret_283 = __noswap_vfmlsl_high_f16(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \
__ret_283 = __builtin_shufflevector(__ret_283, __ret_283, 1, 0); \
__ret_283; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlslq_lane_low_u32(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \
#define vfmlslq_lane_low_f16(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \
float32x4_t __s0_284 = __p0_284; \
float16x8_t __s1_284 = __p1_284; \
float16x4_t __s2_284 = __p2_284; \
float32x4_t __ret_284; \
__ret_284 = vfmlslq_low_u32(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284)}); \
__ret_284 = vfmlslq_low_f16(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284)}); \
__ret_284; \
})
#else
#define vfmlslq_lane_low_u32(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \
#define vfmlslq_lane_low_f16(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \
float32x4_t __s0_285 = __p0_285; \
float16x8_t __s1_285 = __p1_285; \
float16x4_t __s2_285 = __p2_285; \
@@ -72169,23 +72169,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_285; __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x4_t __rev2_285; __rev2_285 = __builtin_shufflevector(__s2_285, __s2_285, 3, 2, 1, 0); \
float32x4_t __ret_285; \
__ret_285 = __noswap_vfmlslq_low_u32(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \
__ret_285 = __noswap_vfmlslq_low_f16(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \
__ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 3, 2, 1, 0); \
__ret_285; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlsl_lane_low_u32(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \
#define vfmlsl_lane_low_f16(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \
float32x2_t __s0_286 = __p0_286; \
float16x4_t __s1_286 = __p1_286; \
float16x4_t __s2_286 = __p2_286; \
float32x2_t __ret_286; \
__ret_286 = vfmlsl_low_u32(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \
__ret_286 = vfmlsl_low_f16(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \
__ret_286; \
})
#else
#define vfmlsl_lane_low_u32(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \
#define vfmlsl_lane_low_f16(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \
float32x2_t __s0_287 = __p0_287; \
float16x4_t __s1_287 = __p1_287; \
float16x4_t __s2_287 = __p2_287; \
@@ -72193,23 +72193,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_287; __rev1_287 = __builtin_shufflevector(__s1_287, __s1_287, 3, 2, 1, 0); \
float16x4_t __rev2_287; __rev2_287 = __builtin_shufflevector(__s2_287, __s2_287, 3, 2, 1, 0); \
float32x2_t __ret_287; \
__ret_287 = __noswap_vfmlsl_low_u32(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); \
__ret_287 = __noswap_vfmlsl_low_f16(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); \
__ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \
__ret_287; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlslq_laneq_high_u32(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \
#define vfmlslq_laneq_high_f16(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \
float32x4_t __s0_288 = __p0_288; \
float16x8_t __s1_288 = __p1_288; \
float16x8_t __s2_288 = __p2_288; \
float32x4_t __ret_288; \
__ret_288 = vfmlslq_high_u32(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \
__ret_288 = vfmlslq_high_f16(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \
__ret_288; \
})
#else
#define vfmlslq_laneq_high_u32(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \
#define vfmlslq_laneq_high_f16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \
float32x4_t __s0_289 = __p0_289; \
float16x8_t __s1_289 = __p1_289; \
float16x8_t __s2_289 = __p2_289; \
@@ -72217,23 +72217,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_289; __rev1_289 = __builtin_shufflevector(__s1_289, __s1_289, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev2_289; __rev2_289 = __builtin_shufflevector(__s2_289, __s2_289, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x4_t __ret_289; \
__ret_289 = __noswap_vfmlslq_high_u32(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \
__ret_289 = __noswap_vfmlslq_high_f16(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \
__ret_289 = __builtin_shufflevector(__ret_289, __ret_289, 3, 2, 1, 0); \
__ret_289; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlsl_laneq_high_u32(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \
#define vfmlsl_laneq_high_f16(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \
float32x2_t __s0_290 = __p0_290; \
float16x4_t __s1_290 = __p1_290; \
float16x8_t __s2_290 = __p2_290; \
float32x2_t __ret_290; \
__ret_290 = vfmlsl_high_u32(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290)}); \
__ret_290 = vfmlsl_high_f16(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290)}); \
__ret_290; \
})
#else
#define vfmlsl_laneq_high_u32(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \
#define vfmlsl_laneq_high_f16(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \
float32x2_t __s0_291 = __p0_291; \
float16x4_t __s1_291 = __p1_291; \
float16x8_t __s2_291 = __p2_291; \
@@ -72241,23 +72241,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_291; __rev1_291 = __builtin_shufflevector(__s1_291, __s1_291, 3, 2, 1, 0); \
float16x8_t __rev2_291; __rev2_291 = __builtin_shufflevector(__s2_291, __s2_291, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x2_t __ret_291; \
__ret_291 = __noswap_vfmlsl_high_u32(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \
__ret_291 = __noswap_vfmlsl_high_f16(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \
__ret_291 = __builtin_shufflevector(__ret_291, __ret_291, 1, 0); \
__ret_291; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlslq_laneq_low_u32(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \
#define vfmlslq_laneq_low_f16(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \
float32x4_t __s0_292 = __p0_292; \
float16x8_t __s1_292 = __p1_292; \
float16x8_t __s2_292 = __p2_292; \
float32x4_t __ret_292; \
__ret_292 = vfmlslq_low_u32(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \
__ret_292 = vfmlslq_low_f16(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \
__ret_292; \
})
#else
#define vfmlslq_laneq_low_u32(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \
#define vfmlslq_laneq_low_f16(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \
float32x4_t __s0_293 = __p0_293; \
float16x8_t __s1_293 = __p1_293; \
float16x8_t __s2_293 = __p2_293; \
@@ -72265,23 +72265,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x8_t __rev1_293; __rev1_293 = __builtin_shufflevector(__s1_293, __s1_293, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev2_293; __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x4_t __ret_293; \
__ret_293 = __noswap_vfmlslq_low_u32(__rev0_293, __rev1_293, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \
__ret_293 = __noswap_vfmlslq_low_f16(__rev0_293, __rev1_293, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \
__ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 3, 2, 1, 0); \
__ret_293; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vfmlsl_laneq_low_u32(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \
#define vfmlsl_laneq_low_f16(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \
float32x2_t __s0_294 = __p0_294; \
float16x4_t __s1_294 = __p1_294; \
float16x8_t __s2_294 = __p2_294; \
float32x2_t __ret_294; \
__ret_294 = vfmlsl_low_u32(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \
__ret_294 = vfmlsl_low_f16(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \
__ret_294; \
})
#else
#define vfmlsl_laneq_low_u32(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \
#define vfmlsl_laneq_low_f16(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \
float32x2_t __s0_295 = __p0_295; \
float16x4_t __s1_295 = __p1_295; \
float16x8_t __s2_295 = __p2_295; \
@@ -72289,7 +72289,7 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in
float16x4_t __rev1_295; __rev1_295 = __builtin_shufflevector(__s1_295, __s1_295, 3, 2, 1, 0); \
float16x8_t __rev2_295; __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 7, 6, 5, 4, 3, 2, 1, 0); \
float32x2_t __ret_295; \
__ret_295 = __noswap_vfmlsl_low_u32(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \
__ret_295 = __noswap_vfmlsl_low_f16(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \
__ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \
__ret_295; \
})
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+5 -27
View File
@@ -1,22 +1,8 @@
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -146,21 +132,13 @@ _mm256_andnot_si256(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu8(__m256i __a, __m256i __b)
{
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
return (__m256i)__builtin_convertvector(
((__builtin_convertvector((__v32qu)__a, __v32hu) +
__builtin_convertvector((__v32qu)__b, __v32hu)) + 1)
>> 1, __v32qu);
return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu16(__m256i __a, __m256i __b)
{
typedef unsigned int __v16su __attribute__((__vector_size__(64)));
return (__m256i)__builtin_convertvector(
((__builtin_convertvector((__v16hu)__a, __v16su) +
__builtin_convertvector((__v16hu)__b, __v16su)) + 1)
>> 1, __v16hu);
return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
+279
View File
@@ -0,0 +1,279 @@
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
typedef unsigned short __bfloat16;
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
__min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __A
/// A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
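/* Usage sketch (illustrative only, not an upstream intrinsic): widen one
   bfloat16 scalar to float with the intrinsic above. Because bfloat16 is the
   upper 16 bits of an IEEE-754 binary32, this widening is exact. Assumes
   <immintrin.h> and compilation with -mavx512bf16. */
static __inline__ float __DEFAULT_FN_ATTRS bf16_to_float_example(__bfloat16 x) {
  return _mm_cvtsbh_ss(x);
}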
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
(__v16sf) __B);
}
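/* Usage sketch (illustrative only): pack two 512-bit float vectors into one
   512-bit bf16 vector. Note the operand order of the intrinsic above: the
   second operand fills the lower 256 bits of the result, the first the upper. */
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
pack_two_ps_example(__m512 hi, __m512 lo) {
  /* Round-to-nearest-even narrowing of 16 + 16 floats to 32 bfloat16s. */
  return _mm512_cvtne2ps_pbh(hi, lo);
}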
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 512-bit vector of [32 x bfloat].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
(__v32hi)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
(__v32hi)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)_mm256_undefined_si256(),
(__mmask16)-1);
}
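/* Usage sketch (illustrative only): narrow 16 floats from memory to 16
   bfloat16s and store the 32-byte result (dst must hold 16 elements). The
   pointer names and the unaligned load/store are assumptions for the example. */
static __inline__ void __DEFAULT_FN_ATTRS512
narrow_ps_buffer_example(const float *src, __bfloat16 *dst) {
  __m512 v = _mm512_loadu_ps(src);                    /* 16 x f32 */
  __m256bh b = _mm512_cvtneps_pbh(v);                 /* 16 x bf16 */
  _mm256_storeu_si256((__m256i *)dst, (__m256i)b);    /* 32 bytes out */
}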
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)__W,
(__mmask16)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16hi)_mm256_setzero_si256(),
(__mmask16)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
/// __A, __B and __D
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
(__v16si) __A,
(__v16si) __B);
}
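/* Usage sketch (illustrative only): one accumulation step of a bf16 dot
   product. For each 32-bit lane i, the intrinsic above computes
   acc[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1], widening each bf16 pair to single
   precision before multiplying. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
dp_step_example(__m512 acc, __m512bh a, __m512bh b) {
  return _mm512_dpbf16_ps(acc, a, b);
}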
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
/// __A, __B and __D
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
/// __A, __B and __D
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)_mm512_setzero_si512());
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
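/* Usage sketch (illustrative only): a narrowing round trip. The narrowing
   step rounds to nearest-even and keeps 8 significant bits; the widening
   above is a plain 16-bit left shift and is exact. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
bf16_roundtrip_example(__m512 v) {
  __m256bh b = _mm512_cvtneps_pbh(v);   /* 16 x f32 -> 16 x bf16 */
  return _mm512_cvtpbh_ps(b);           /* 16 x bf16 -> 16 x f32 */
}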
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
(__m512i)__S, (__mmask16)__U,
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512
#endif
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+11 -33
View File
@@ -1,23 +1,9 @@
/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -719,11 +705,7 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_avg_epu8 (__m512i __A, __m512i __B)
{
typedef unsigned short __v64hu __attribute__((__vector_size__(128)));
return (__m512i)__builtin_convertvector(
((__builtin_convertvector((__v64qu) __A, __v64hu) +
__builtin_convertvector((__v64qu) __B, __v64hu)) + 1)
>> 1, __v64qu);
return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -746,11 +728,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_avg_epu16 (__m512i __A, __m512i __B)
{
typedef unsigned int __v32su __attribute__((__vector_size__(128)));
return (__m512i)__builtin_convertvector(
((__builtin_convertvector((__v32hu) __A, __v32su) +
__builtin_convertvector((__v32hu) __B, __v32su)) + 1)
>> 1, __v32hu);
return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1733,14 +1711,14 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
(__v64qi) _mm512_setzero_si512());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
(__mmask64) __B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
@@ -1751,7 +1729,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi16 (void const *__P)
{
struct __loadu_epi16 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi16*)__P)->__v;
}
@@ -1777,7 +1755,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi8 (void const *__P)
{
struct __loadu_epi8 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi8*)__P)->__v;
}
@@ -1803,7 +1781,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi16 (void *__P, __m512i __A)
{
struct __storeu_epi16 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi16*)__P)->__v = __A;
}
@@ -1820,7 +1798,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi8 (void *__P, __m512i __A)
{
struct __storeu_epi8 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi8*)__P)->__v = __A;
}
+17 -35
View File
@@ -1,23 +1,9 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -34,49 +20,45 @@
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+47 -68
View File
@@ -1,22 +1,8 @@
/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -40,9 +26,13 @@ typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));
typedef float __m512 __attribute__((__vector_size__(64)));
typedef double __m512d __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64)));
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));
typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
@@ -1991,12 +1981,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_add_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_add_round_pd((A), (B), (R)), \
(__v8df)(__m512d)(W));
(__v8df)(__m512d)(W))
#define _mm512_maskz_add_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_add_round_pd((A), (B), (R)), \
(__v8df)_mm512_setzero_pd());
(__v8df)_mm512_setzero_pd())
#define _mm512_add_round_ps(A, B, R) \
(__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
@@ -2005,12 +1995,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_add_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_add_round_ps((A), (B), (R)), \
(__v16sf)(__m512)(W));
(__v16sf)(__m512)(W))
#define _mm512_maskz_add_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_add_round_ps((A), (B), (R)), \
(__v16sf)_mm512_setzero_ps());
(__v16sf)_mm512_setzero_ps())
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2106,12 +2096,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_sub_round_pd((A), (B), (R)), \
(__v8df)(__m512d)(W));
(__v8df)(__m512d)(W))
#define _mm512_maskz_sub_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_sub_round_pd((A), (B), (R)), \
(__v8df)_mm512_setzero_pd());
(__v8df)_mm512_setzero_pd())
#define _mm512_sub_round_ps(A, B, R) \
(__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
@@ -2120,12 +2110,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
(__v16sf)(__m512)(W));
(__v16sf)(__m512)(W))
#define _mm512_maskz_sub_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
(__v16sf)_mm512_setzero_ps());
(__v16sf)_mm512_setzero_ps())
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2221,12 +2211,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_mul_round_pd((A), (B), (R)), \
(__v8df)(__m512d)(W));
(__v8df)(__m512d)(W))
#define _mm512_maskz_mul_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_mul_round_pd((A), (B), (R)), \
(__v8df)_mm512_setzero_pd());
(__v8df)_mm512_setzero_pd())
#define _mm512_mul_round_ps(A, B, R) \
(__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
@@ -2235,12 +2225,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
(__v16sf)(__m512)(W));
(__v16sf)(__m512)(W))
#define _mm512_maskz_mul_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
(__v16sf)_mm512_setzero_ps());
(__v16sf)_mm512_setzero_ps())
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2349,12 +2339,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_div_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_div_round_pd((A), (B), (R)), \
(__v8df)(__m512d)(W));
(__v8df)(__m512d)(W))
#define _mm512_maskz_div_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_div_round_pd((A), (B), (R)), \
(__v8df)_mm512_setzero_pd());
(__v8df)_mm512_setzero_pd())
#define _mm512_div_round_ps(A, B, R) \
(__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
@@ -2363,12 +2353,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
#define _mm512_mask_div_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_div_round_ps((A), (B), (R)), \
(__v16sf)(__m512)(W));
(__v16sf)(__m512)(W))
#define _mm512_maskz_div_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_div_round_ps((A), (B), (R)), \
(__v16sf)_mm512_setzero_ps());
(__v16sf)_mm512_setzero_ps())
#define _mm512_roundscale_ps(A, B) \
(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
@@ -3789,20 +3779,9 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
(__v16hi)_mm256_setzero_si256(), \
(__mmask16)(W))
#define _mm512_cvtps_ph(A, I) \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)_mm256_setzero_si256(), \
(__mmask16)-1)
#define _mm512_mask_cvtps_ph(U, W, A, I) \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)(__m256i)(U), \
(__mmask16)(W))
#define _mm512_maskz_cvtps_ph(W, A, I) \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)_mm256_setzero_si256(), \
(__mmask16)(W))
#define _mm512_cvtps_ph _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
#define _mm512_cvt_roundph_ps(A, R) \
(__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
@@ -4324,7 +4303,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_si512 (void const *__P)
{
struct __loadu_si512 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_si512*)__P)->__v;
}
@@ -4333,7 +4312,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi32 (void const *__P)
{
struct __loadu_epi32 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi32*)__P)->__v;
}
@@ -4360,7 +4339,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi64 (void const *__P)
{
struct __loadu_epi64 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi64*)__P)->__v;
}
@@ -4420,7 +4399,7 @@ static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_loadu_pd(void const *__p)
{
struct __loadu_pd {
__m512d __v;
__m512d_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__p)->__v;
}
@@ -4429,7 +4408,7 @@ static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_loadu_ps(void const *__p)
{
struct __loadu_ps {
__m512 __v;
__m512_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_ps*)__p)->__v;
}
@@ -4504,7 +4483,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
struct __storeu_epi64 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi64*)__P)->__v = __A;
}
@@ -4520,7 +4499,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_si512 (void *__P, __m512i __A)
{
struct __storeu_si512 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_si512*)__P)->__v = __A;
}
@@ -4529,7 +4508,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi32 (void *__P, __m512i __A)
{
struct __storeu_epi32 {
__m512i __v;
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi32*)__P)->__v = __A;
}
@@ -4551,7 +4530,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_pd(void *__P, __m512d __A)
{
struct __storeu_pd {
__m512d __v;
__m512d_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_pd*)__P)->__v = __A;
}
@@ -4567,7 +4546,7 @@ static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_ps(void *__P, __m512 __A)
{
struct __storeu_ps {
__m512 __v;
__m512_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_ps*)__P)->__v = __A;
}
@@ -9329,7 +9308,7 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
__v2du __t6 = __t4 op __t5; \
__v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
__v2du __t8 = __t6 op __t7; \
return __t8[0];
return __t8[0]
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
_mm512_mask_reduce_operator(+);
@@ -9381,7 +9360,7 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
__m128d __t6 = __t4 op __t5; \
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
__m128d __t8 = __t6 op __t7; \
return __t8[0];
return __t8[0]
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
_mm512_mask_reduce_operator(+);
@@ -9415,7 +9394,7 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
__v4su __t8 = __t6 op __t7; \
__v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
__v4su __t10 = __t8 op __t9; \
return __t10[0];
return __t10[0]
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
@@ -9473,7 +9452,7 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
__m128 __t8 = __t6 op __t7; \
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
__m128 __t10 = __t8 op __t9; \
return __t10[0];
return __t10[0]
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ps(__m512 __W) {
@@ -9505,7 +9484,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
__m512i __t4 = _mm512_##op(__t2, __t3); \
__m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
__v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
return __t6[0];
return __t6[0]
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi64(__m512i __V) {
@@ -9563,7 +9542,7 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
__m128i __t8 = _mm_##op(__t6, __t7); \
__m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
__v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
return __t10[0];
return __t10[0]
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi32(__m512i __V) {
@@ -9619,7 +9598,7 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
__m128d __t6 = _mm_##op(__t4, __t5); \
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
__m128d __t8 = _mm_##op(__t6, __t7); \
return __t8[0];
return __t8[0]
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
@@ -9655,7 +9634,7 @@ _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
__m128 __t8 = _mm_##op(__t6, __t7); \
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
__m128 __t10 = _mm_##op(__t8, __t9); \
return __t10[0];
return __t10[0]
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+474
View File
@@ -0,0 +1,474 @@
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
(__v4sf) __B);
}
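/* Usage sketch (illustrative only): the 128-bit variant packs 4 + 4 floats
   into 8 bfloat16s; as with the 512-bit form, the second operand of the
   intrinsic above lands in the low half of the result. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
pack_two_ps128_example(__m128 hi, __m128 lo) {
  return _mm_cvtne2ps_pbh(hi, lo);
}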
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
(__v8hi)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
(__v8hi)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
(__v8sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
(__v16hi)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
(__v16hi)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_pbh(__m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)_mm_undefined_si128(),
(__mmask8)-1);
}
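/* Usage sketch (illustrative only): only the low 64 bits of the 128-bit
   result carry the 4 converted values, so a 64-bit store is enough to write
   them out. The raw unsigned short buffer is an assumption for the example. */
static __inline__ void __DEFAULT_FN_ATTRS128
narrow_4ps_example(__m128 v, unsigned short *dst) {
  __m128bh b = _mm_cvtneps_pbh(v);
  _mm_storel_epi64((__m128i *)dst, (__m128i)b);   /* low 8 bytes only */
}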
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8hi)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_pbh(__m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)_mm_undefined_si128(),
(__mmask8)-1);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8hi)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
(__v4si)__A,
(__v4si)__B);
}
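/* Illustrative sketch, not part of this header: a bf16 dot product over n
 * floats (n assumed to be a multiple of 8), combining the two-input
 * conversion intrinsic with VDPBF16PS accumulation. Hypothetical helper,
 * assuming <immintrin.h> and -mavx512bf16 -mavx512vl. */
#include <immintrin.h>
static float bf16_dot(const float *x, const float *y, int n)
{
  __m128 acc = _mm_setzero_ps();
  for (int i = 0; i < n; i += 8) {
    /* Each __m128bh holds eight bf16 values built from eight floats. */
    __m128bh xb = _mm_cvtne2ps_pbh(_mm_loadu_ps(x + i + 4), _mm_loadu_ps(x + i));
    __m128bh yb = _mm_cvtne2ps_pbh(_mm_loadu_ps(y + i + 4), _mm_loadu_ps(y + i));
    acc = _mm_dpbf16_ps(acc, xb, yb); /* acc[j] += xb[2j]*yb[2j] + xb[2j+1]*yb[2j+1] */
  }
  float lanes[4];
  _mm_storeu_ps(lanes, acc);
  return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}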
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means the dot product of __A and __B accumulated with __D. A 0 means __D.
/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means the dot product of __A and __B accumulated with __D. A 0 means 0.
/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)_mm_setzero_si128());
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
(__v8si)__A,
(__v8si)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means the dot product of __A and __B accumulated with __D. A 0 means __D.
/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means the dot product of __A and __B accumulated with __D. A 0 means 0.
/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
/// __A and __B, accumulated into __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)_mm256_setzero_si256());
}
/// Convert One Single float Data to One BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A float value.
/// \returns A bf16 value whose sign and exponent fields are unchanged and
/// whose fraction field is truncated to 7 bits.
static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
return __R[0];
}
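/* Illustrative sketch, not part of this header: the scalar conversion above
 * yields a bf16 whose bit pattern corresponds to the upper 16 bits of the
 * float's IEEE-754 encoding, up to rounding. Hypothetical helper, assuming
 * <immintrin.h>, -mavx512bf16 -mavx512vl, and that __bfloat16 is the 16-bit
 * type defined by these headers. */
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
static uint16_t float_to_bf16_bits(float x)
{
  __bfloat16 h = _mm_cvtness_sbh(x);
  uint16_t bits;
  memcpy(&bits, &h, sizeof(bits)); /* reinterpret the 16-bit payload */
  return bits;
}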
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
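/* Illustrative sketch, not part of this header: round-trips eight floats
 * through bf16. The bf16-to-float direction needs no dedicated instruction;
 * as the body above shows, it is a widen to 32 bits followed by a 16-bit
 * left shift. Hypothetical helper, assuming <immintrin.h> and
 * -mavx512bf16 -mavx512vl. */
#include <immintrin.h>
static __m256 round_trip_bf16(__m256 v)
{
  __m128bh packed = _mm256_cvtneps_pbh(v); /* 8 floats -> 8 bf16 */
  return _mm256_cvtpbh_ps(packed);         /* 8 bf16 -> 8 floats */
}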
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// An 8-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// An 8-bit mask. Elements are copied from __S when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
16));
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
+3 -17
View File
@@ -1,23 +1,9 @@
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+11 -25
View File
@@ -1,22 +1,8 @@
/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -2301,7 +2287,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_loadu_epi16 (void const *__P)
{
struct __loadu_epi16 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi16*)__P)->__v;
}
@@ -2327,7 +2313,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256
_mm256_loadu_epi16 (void const *__P)
{
struct __loadu_epi16 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi16*)__P)->__v;
}
@@ -2353,7 +2339,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_loadu_epi8 (void const *__P)
{
struct __loadu_epi8 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi8*)__P)->__v;
}
@@ -2379,7 +2365,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256
_mm256_loadu_epi8 (void const *__P)
{
struct __loadu_epi8 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi8*)__P)->__v;
}
@@ -2405,7 +2391,7 @@ static __inline void __DEFAULT_FN_ATTRS128
_mm_storeu_epi16 (void *__P, __m128i __A)
{
struct __storeu_epi16 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi16*)__P)->__v = __A;
}
@@ -2422,7 +2408,7 @@ static __inline void __DEFAULT_FN_ATTRS256
_mm256_storeu_epi16 (void *__P, __m256i __A)
{
struct __storeu_epi16 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi16*)__P)->__v = __A;
}
@@ -2439,7 +2425,7 @@ static __inline void __DEFAULT_FN_ATTRS128
_mm_storeu_epi8 (void *__P, __m128i __A)
{
struct __storeu_epi8 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi8*)__P)->__v = __A;
}
@@ -2456,7 +2442,7 @@ static __inline void __DEFAULT_FN_ATTRS256
_mm256_storeu_epi8 (void *__P, __m256i __A)
{
struct __storeu_epi8 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi8*)__P)->__v = __A;
}
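/* Illustrative sketch, not part of this header: with the __m256i_u member
 * type, the packed, may_alias struct above expresses a byte-aligned access,
 * so these helpers accept pointers of any alignment. Hypothetical function,
 * assuming <immintrin.h> and -mavx512bw -mavx512vl. */
#include <immintrin.h>
static void copy_16_words(short *dst, const short *src)
{
  __m256i v = _mm256_loadu_epi16(src); /* src need not be 32-byte aligned */
  _mm256_storeu_epi16(dst, v);         /* neither does dst */
}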
+31 -55
View File
@@ -1,22 +1,8 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -60,99 +46,89 @@ _mm256_broadcastmw_epi32 (__mmask16 __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di) _mm_undefined_si128 (),
(__mmask8) -1);
return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di) __W,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_si128 (),
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) _mm256_undefined_si256 (),
(__mmask8) -1);
return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) __W,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) _mm256_setzero_si256 (),
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) _mm_undefined_si128 (),
(__mmask8) -1);
return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
}
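/* Illustrative sketch, not part of this header: _mm_conflict_epi32 reports,
 * per lane, a bitmask of the lower-indexed lanes that hold the same 32-bit
 * value, so an all-zero result means the four lanes are pairwise distinct.
 * Hypothetical helper, assuming <immintrin.h> and -mavx512cd -mavx512vl. */
#include <immintrin.h>
static int all_lanes_distinct_epi32(__m128i v)
{
  __m128i conflicts = _mm_conflict_epi32(v);
  return _mm_testz_si128(conflicts, conflicts); /* 1 when no lane repeats */
}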
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) __W,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) _mm_setzero_si128 (),
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si) _mm256_undefined_si256 (),
(__mmask8) -1);
return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si) __W,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
+17 -35
View File
@@ -1,22 +1,8 @@
/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -523,23 +509,21 @@ _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ps (__m256i __A) {
return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
(__v4sf) _mm_setzero_ps(),
(__mmask8) -1);
return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
}
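/* Illustrative sketch, not part of this header: converts four signed 64-bit
 * integers to four floats; the unmasked form above now lowers to a plain
 * __builtin_convertvector rather than a masked builtin. Hypothetical helper,
 * assuming <immintrin.h> and -mavx512dq -mavx512vl. */
#include <immintrin.h>
static __m128 quad_i64_to_ps(long long a, long long b, long long c, long long d)
{
  __m256i v = _mm256_set_epi64x(d, c, b, a); /* lane 0 = a ... lane 3 = d */
  return _mm256_cvtepi64_ps(v);
}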
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
(__v4sf) __W,
(__mmask8) __U);
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm256_cvtepi64_ps(__A),
(__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
(__v4sf) _mm_setzero_ps(),
(__mmask8) __U);
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm256_cvtepi64_ps(__A),
(__v4sf)_mm_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -771,23 +755,21 @@ _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ps (__m256i __A) {
return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
(__v4sf) _mm_setzero_ps(),
(__mmask8) -1);
return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
(__v4sf) __W,
(__mmask8) __U);
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm256_cvtepu64_ps(__A),
(__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
(__v4sf) _mm_setzero_ps(),
(__mmask8) __U);
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm256_cvtepu64_ps(__A),
(__v4sf)_mm_setzero_ps());
}
#define _mm_range_pd(A, B, C) \
+19 -58
View File
@@ -1,22 +1,8 @@
/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -5513,7 +5499,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_loadu_epi64 (void const *__P)
{
struct __loadu_epi64 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi64*)__P)->__v;
}
@@ -5539,7 +5525,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256
_mm256_loadu_epi64 (void const *__P)
{
struct __loadu_epi64 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi64*)__P)->__v;
}
@@ -5565,7 +5551,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_loadu_epi32 (void const *__P)
{
struct __loadu_epi32 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi32*)__P)->__v;
}
@@ -5591,7 +5577,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256
_mm256_loadu_epi32 (void const *__P)
{
struct __loadu_epi32 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_epi32*)__P)->__v;
}
@@ -5717,7 +5703,7 @@ static __inline void __DEFAULT_FN_ATTRS128
_mm_storeu_epi64 (void *__P, __m128i __A)
{
struct __storeu_epi64 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi64*)__P)->__v = __A;
}
@@ -5734,7 +5720,7 @@ static __inline void __DEFAULT_FN_ATTRS256
_mm256_storeu_epi64 (void *__P, __m256i __A)
{
struct __storeu_epi64 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi64*)__P)->__v = __A;
}
@@ -5751,7 +5737,7 @@ static __inline void __DEFAULT_FN_ATTRS128
_mm_storeu_epi32 (void *__P, __m128i __A)
{
struct __storeu_epi32 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi32*)__P)->__v = __A;
}
@@ -5768,7 +5754,7 @@ static __inline void __DEFAULT_FN_ATTRS256
_mm256_storeu_epi32 (void *__P, __m256i __A)
{
struct __storeu_epi32 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi32*)__P)->__v = __A;
}
@@ -7000,7 +6986,7 @@ _mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
__builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtsepi32_epi8 (__m256i __A)
{
return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
@@ -7023,7 +7009,7 @@ _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
__M);
}
static __inline__ void __DEFAULT_FN_ATTRS128
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
{
__builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
@@ -7581,7 +7567,7 @@ _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
__M);
}
static __inline__ void __DEFAULT_FN_ATTRS256
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
{
__builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
@@ -8425,22 +8411,6 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
(__mmask8) __U);
}
static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A)
{
return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
(__v8hi) __W,
(__mmask8) __U);
}
static __inline __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A)
{
return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
(__v8hi) _mm_setzero_si128 (),
(__mmask8) __U);
}
#define _mm_mask_cvt_roundps_ph(W, U, A, I) \
(__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
(__v8hi)(__m128i)(W), \
@@ -8451,21 +8421,9 @@ _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A)
(__v8hi)_mm_setzero_si128(), \
(__mmask8)(U))
static __inline __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A)
{
return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
(__v8hi) __W,
(__mmask8) __U);
}
#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph
#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph
static __inline __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A)
{
return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
(__v8hi) _mm_setzero_si128(),
(__mmask8) __U);
}
#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \
(__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
(__v8hi)(__m128i)(W), \
@@ -8476,6 +8434,9 @@ _mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A)
(__v8hi)_mm_setzero_si128(), \
(__mmask8)(U))
#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph
#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+121
View File
@@ -0,0 +1,121 @@
/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VLVP2INTERSECT_H
#define _AVX512VLVP2INTERSECT_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
__min_vector_width__(256)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \param __b
/// A 256-bit vector of [8 x i32].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
}
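/* Illustrative sketch, not part of this header: computes which lanes of the
 * first operand have a matching 32-bit value anywhere in the second; the
 * first output mask is indexed by lanes of __a, the second by lanes of __b.
 * Hypothetical helper, assuming <immintrin.h> and
 * -mavx512vp2intersect -mavx512vl. */
#include <immintrin.h>
static __mmask8 lanes_of_a_found_in_b(__m256i a, __m256i b)
{
  __mmask8 in_a, in_b;
  _mm256_2intersect_epi32(a, b, &in_a, &in_b);
  return in_a; /* bit i set when a's lane i matches some lane of b */
}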
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x i64].
/// \param __b
/// A 256-bit vector of [4 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \param __b
/// A 128-bit vector of [4 x i32].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x i64].
/// \param __b
/// A 128-bit vector of [2 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
+3 -17
View File
@@ -1,23 +1,9 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+77
View File
@@ -0,0 +1,77 @@
/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VP2INTERSECT_H
#define _AVX512VP2INTERSECT_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \
__min_vector_width__(512)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 512-bit vector of [16 x i32].
/// \param __b
/// A 512-bit vector of [16 x i32].
/// \param __m0
/// A pointer to a 16-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to a 16-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
__builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 512-bit vector of [8 x i64].
/// \param __b
/// A 512-bit vector of [8 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
}
#undef __DEFAULT_FN_ATTRS
#endif
+3 -17
View File
@@ -1,23 +1,9 @@
/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+20 -30
View File
@@ -1,22 +1,8 @@
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -45,9 +31,13 @@ typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
* appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
@@ -3113,7 +3103,7 @@ static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu_pd(double const *__p)
{
struct __loadu_pd {
__m256d __v;
__m256d_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__p)->__v;
}
@@ -3133,7 +3123,7 @@ static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu_ps(float const *__p)
{
struct __loadu_ps {
__m256 __v;
__m256_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_ps*)__p)->__v;
}
@@ -3166,10 +3156,10 @@ _mm256_load_si256(__m256i const *__p)
/// A pointer to a 256-bit integer vector containing integer values.
/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu_si256(__m256i const *__p)
_mm256_loadu_si256(__m256i_u const *__p)
{
struct __loadu_si256 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_si256*)__p)->__v;
}
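/* Illustrative sketch, not part of this header: with the new __m256i_u
 * parameter type, _mm256_loadu_si256 can be handed a pointer that is only
 * byte-aligned, e.g. an arbitrary offset into a byte buffer. Hypothetical
 * helper, assuming <immintrin.h> and an AVX-enabled target (-mavx). */
#include <immintrin.h>
static __m256i load_32_bytes_at(const unsigned char *buf, unsigned long off)
{
  return _mm256_loadu_si256((const __m256i_u *)(buf + off));
}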
@@ -3246,7 +3236,7 @@ static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_pd(double *__p, __m256d __a)
{
struct __storeu_pd {
__m256d __v;
__m256d_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_pd*)__p)->__v = __a;
}
@@ -3266,7 +3256,7 @@ static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_ps(float *__p, __m256 __a)
{
struct __storeu_ps {
__m256 __v;
__m256_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_ps*)__p)->__v = __a;
}
@@ -3301,10 +3291,10 @@ _mm256_store_si256(__m256i *__p, __m256i __a)
/// \param __a
/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_si256(__m256i *__p, __m256i __a)
_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
{
struct __storeu_si256 {
__m256i __v;
__m256i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_si256*)__p)->__v = __a;
}
@@ -4834,7 +4824,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
/// address of the memory location does not have to be aligned.
/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
{
__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
@@ -4918,7 +4908,7 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
/// \param __a
/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
{
__m128i __v128;
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- cetintrin.h - CET intrinsic --------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+7 -17
View File
@@ -1,22 +1,8 @@
/*===---- cpuid.h - X86 cpu model detection --------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -191,6 +177,7 @@
#define bit_CLDEMOTE 0x02000000
#define bit_MOVDIRI 0x08000000
#define bit_MOVDIR64B 0x10000000
#define bit_ENQCMD 0x20000000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
@@ -198,6 +185,9 @@
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVX512BF16 0x00000020
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
#define bit_XSAVEC 0x00000002
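For reference, a minimal probe of the newly added ENQCMD feature bit could look like the sketch below. It is illustrative only (not part of the diff) and assumes the __get_cpuid_count helper that this same cpuid.h provides.

/* Probe CPUID leaf 7, sub-leaf 0: bit_ENQCMD is reported in %ecx. */
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ecx & bit_ENQCMD))
        puts("ENQCMD supported");
    else
        puts("ENQCMD not supported");
    return 0;
}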
+18 -37
View File
@@ -1,22 +1,8 @@
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -26,8 +12,11 @@
#include <xmmintrin.h>
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));
/* Type defines. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
@@ -1652,7 +1641,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
struct __loadu_pd {
__m128d __v;
__m128d_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_pd*)__dp)->__v;
}
@@ -2042,7 +2031,7 @@ static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a)
{
struct __storeu_pd {
__m128d __v;
__m128d_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_pd*)__dp)->__v = __a;
}
@@ -2316,11 +2305,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
return (__m128i)__builtin_convertvector(
((__builtin_convertvector((__v16qu)__a, __v16hu) +
__builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
>> 1, __v16qu);
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}
/// Computes the rounded averages of corresponding elements of two
@@ -2340,11 +2325,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
return (__m128i)__builtin_convertvector(
((__builtin_convertvector((__v8hu)__a, __v8su) +
__builtin_convertvector((__v8hu)__b, __v8su)) + 1)
>> 1, __v8hu);
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}
/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
@@ -3564,10 +3545,10 @@ _mm_load_si128(__m128i const *__p)
/// A pointer to a memory location containing integer values.
/// \returns A 128-bit integer vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i const *__p)
_mm_loadu_si128(__m128i_u const *__p)
{
struct __loadu_si128 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((struct __loadu_si128*)__p)->__v;
}
@@ -3585,7 +3566,7 @@ _mm_loadu_si128(__m128i const *__p)
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
/// moved value. The higher order bits are cleared.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i const *__p)
_mm_loadl_epi64(__m128i_u const *__p)
{
struct __mm_loadl_epi64_struct {
long long __u;
@@ -4027,10 +4008,10 @@ _mm_store_si128(__m128i *__p, __m128i __b)
/// \param __b
/// A 128-bit integer vector containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
_mm_storeu_si128(__m128i_u *__p, __m128i __b)
{
struct __storeu_si128 {
__m128i __v;
__m128i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_si128*)__p)->__v = __b;
}
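The new __m128i_u/__m128d_u typedefs carry __aligned__(1), so the unaligned load/store intrinsics above now take pointers that genuinely may be unaligned. A minimal sketch (the helper name is illustrative):

/* Copy 16 bytes through an SSE2 register with no alignment assumption. */
#include <emmintrin.h>

static void copy16_unaligned(unsigned char *dst, const unsigned char *src) {
    __m128i v = _mm_loadu_si128((const __m128i_u *)src); /* unaligned load  */
    _mm_storeu_si128((__m128i_u *)dst, v);               /* unaligned store */
}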
@@ -4139,7 +4120,7 @@ _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
/// value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
_mm_storel_epi64(__m128i_u *__p, __m128i __a)
{
struct __mm_storel_epi64_struct {
long long __u;
+63
View File
@@ -0,0 +1,63 @@
/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ENQCMDINTRIN_H
#define __ENQCMDINTRIN_H
/* Define the default attributes for the functions in this file */
#define _DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to memory pointed to by
/// \a __dst. This intrinsic may only be used in User mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmd (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmd(__dst, __src);
}
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to memory pointed to by
/// \a __dst. This intrinsic may only be used in Privileged mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmds (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmds(__dst, __src);
}
#undef _DEFAULT_FN_ATTRS
#endif /* __ENQCMDINTRIN_H */
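A hedged usage sketch for the new intrinsics follows. The work-queue portal pointer and descriptor layout are placeholders; real use needs -menqcmd plus kernel/device support, and the header must be reached via <immintrin.h> as the guard above enforces.

#include <immintrin.h>

/* Submit a 64-byte descriptor to a device work-queue portal.
   Returns 0 on success, 1 if the device did not accept the command. */
static int submit_descriptor(void *wq_portal, const void *desc64) {
    return _enqcmd(wq_portal, desc64);
}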
+6 -20
View File
@@ -1,22 +1,8 @@
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -52,9 +38,9 @@
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf r = __builtin_ia32_vcvtph2ps(v);
return r[0];
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
return __r[0];
}
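The renaming of the locals to reserved identifiers does not change behavior; a caller converting a half-precision bit pattern looks like the sketch below (0x3C00 encodes 1.0 in binary16; assumes F16C is enabled with -mf16c and the header is reached via <immintrin.h>).

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    unsigned short half_one = 0x3C00;     /* IEEE-754 binary16 for 1.0 */
    printf("%f\n", _cvtsh_ss(half_one));  /* prints 1.000000 */
    return 0;
}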
/// Converts a 32-bit single-precision float value to a 16-bit
+7 -21
View File
@@ -1,22 +1,8 @@
/*===---- float.h - Characteristics of floating point types ----------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -51,7 +37,7 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@@ -78,7 +64,7 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@@ -101,7 +87,7 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@@ -137,7 +123,7 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
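With the relaxed guards above, DECIMAL_DIG also becomes visible in strict C++11 mode and the *_TRUE_MIN macros in strict C++17 mode; plain C already exposed them under C99/C11 or non-strict modes. A trivial C check, for illustration:

#include <float.h>
#include <stdio.h>

int main(void) {
    printf("DECIMAL_DIG  = %d\n", DECIMAL_DIG);
    printf("FLT_TRUE_MIN = %g\n", (double)FLT_TRUE_MIN);
    return 0;
}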
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,23 +1,9 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
+303 -17
View File
@@ -1,22 +1,8 @@
/* ===-------- ia32intrin.h ---------------------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -28,6 +14,160 @@
#ifndef __IA32INTRIN_H
#define __IA32INTRIN_H
/** Find the first set bit starting from the lsb. Result is undefined if
* input is 0.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSF </c> instruction or the
* <c> TZCNT </c> instruction.
*
* \param __A
* A 32-bit integer operand.
* \returns A 32-bit integer containing the bit number.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__bsfd(int __A) {
return __builtin_ctz(__A);
}
/** Find the first set bit starting from the msb. Result is undefined if
* input is 0.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSR </c> instruction or the
* <c> LZCNT </c> instruction and an <c> XOR </c>.
*
* \param __A
* A 32-bit integer operand.
* \returns A 32-bit integer containing the bit number.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__bsrd(int __A) {
return 31 - __builtin_clz(__A);
}
/** Swaps the bytes in the input, converting little endian to big endian or
* vice versa.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSWAP </c> instruction.
*
* \param __A
* A 32-bit integer operand.
* \returns A 32-bit integer containing the swapped bytes.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__bswapd(int __A) {
return __builtin_bswap32(__A);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bswap(int __A) {
return __builtin_bswap32(__A);
}
#define _bit_scan_forward(A) __bsfd((A))
#define _bit_scan_reverse(A) __bsrd((A))
#ifdef __x86_64__
/** Find the first set bit starting from the lsb. Result is undefined if
* input is 0.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSF </c> instruction or the
* <c> TZCNT </c> instruction.
*
* \param __A
* A 64-bit integer operand.
* \returns A 32-bit integer containing the bit number.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__bsfq(long long __A) {
return __builtin_ctzll(__A);
}
/** Find the first set bit starting from the msb. Result is undefined if
* input is 0.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSR </c> instruction or the
* <c> LZCNT </c> instruction and an <c> XOR </c>.
*
* \param __A
* A 64-bit integer operand.
* \returns A 32-bit integer containing the bit number.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__bsrq(long long __A) {
return 63 - __builtin_clzll(__A);
}
/** Swaps the bytes in the input, converting little endian to big endian or
* vice versa.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> BSWAP </c> instruction.
*
* \param __A
* A 64-bit integer operand.
* \returns A 64-bit integer containing the swapped bytes.
*/
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
__bswapq(long long __A) {
return __builtin_bswap64(__A);
}
#define _bswap64(A) __bswapq((A))
#endif
/** Counts the number of bits in the source operand having a value of 1.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> POPCNT </c> instruction or a
* sequence of arithmetic and logic ops to calculate it.
*
* \param __A
* An unsigned 32-bit integer operand.
* \returns A 32-bit integer containing the number of bits with value 1 in the
* source operand.
*/
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__popcntd(unsigned int __A)
{
return __builtin_popcount(__A);
}
#define _popcnt32(A) __popcntd((A))
#ifdef __x86_64__
/** Counts the number of bits in the source operand having a value of 1.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> POPCNT </c> instruction or a
* sequence of arithmetic and logic ops to calculate it.
*
* \param __A
* An unsigned 64-bit integer operand.
* \returns A 64-bit integer containing the number of bits with value 1 in the
* source operand.
*/
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
__popcntq(unsigned long long __A)
{
return __builtin_popcountll(__A);
}
#define _popcnt64(A) __popcntq((A))
#endif /* __x86_64__ */
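The old _bit_scan_forward/_bit_scan_reverse definitions are removed from immintrin.h later in this commit and reappear here as macros over the new __bsfd/__bsrd. A quick sanity sketch with arbitrary values (expected results in comments):

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
    printf("%d\n", __bsfd(0x50));                     /* 4: lowest set bit of 0x50  */
    printf("%d\n", __bsrd(0x50));                     /* 6: highest set bit of 0x50 */
    printf("%#x\n", (unsigned)__bswapd(0x11223344));  /* 0x44332211                 */
    printf("%d\n", __popcntd(0xF0F0u));               /* 8                          */
    return 0;
}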
#ifdef __x86_64__
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__readeflags(void)
@@ -55,6 +195,92 @@ __writeeflags(unsigned int __f)
}
#endif /* !__x86_64__ */
/** Adds the unsigned integer operand to the CRC-32C checksum of the
* unsigned char operand.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> CRC32B </c> instruction.
*
* \param __C
* An unsigned integer operand to add to the CRC-32C checksum of operand
* \a __D.
* \param __D
* An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
__crc32b(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/** Adds the unsigned integer operand to the CRC-32C checksum of the
* unsigned short operand.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> CRC32W </c> instruction.
*
* \param __C
* An unsigned integer operand to add to the CRC-32C checksum of operand
* \a __D.
* \param __D
* An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
__crc32w(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/** Adds the unsigned integer operand to the CRC-32C checksum of the
* second unsigned integer operand.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> CRC32D </c> instruction.
*
* \param __C
* An unsigned integer operand to add to the CRC-32C checksum of operand
* \a __D.
* \param __D
* An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
__crc32d(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/** Adds the unsigned integer operand to the CRC-32C checksum of the
* unsigned 64-bit integer operand.
*
* \headerfile <x86intrin.h>
*
* This intrinsic corresponds to the <c> CRC32Q </c> instruction.
*
* \param __C
* An unsigned integer operand to add to the CRC-32C checksum of operand
* \a __D.
* \param __D
* An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
__crc32q(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
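A short sketch folding a byte buffer into a running CRC-32C with the byte-sized helper (compile with -msse4.2; the seed convention is left to the caller, and the helper name is illustrative):

#include <x86intrin.h>
#include <stddef.h>

static unsigned int crc32c_bytes(unsigned int crc, const unsigned char *p, size_t n) {
    for (size_t i = 0; i < n; ++i)
        crc = __crc32b(crc, p[i]);  /* one CRC32B step per byte */
    return crc;
}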
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
@@ -75,4 +301,64 @@ _wbinvd(void) {
__builtin_ia32_wbinvd();
}
static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
__rolb(unsigned char __X, int __C) {
return __builtin_rotateleft8(__X, __C);
}
static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
__rorb(unsigned char __X, int __C) {
return __builtin_rotateright8(__X, __C);
}
static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
__rolw(unsigned short __X, int __C) {
return __builtin_rotateleft16(__X, __C);
}
static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
__rorw(unsigned short __X, int __C) {
return __builtin_rotateright16(__X, __C);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__rold(unsigned int __X, int __C) {
return __builtin_rotateleft32(__X, __C);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__rord(unsigned int __X, int __C) {
return __builtin_rotateright32(__X, __C);
}
#ifdef __x86_64__
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rolq(unsigned long long __X, int __C) {
return __builtin_rotateleft64(__X, __C);
}
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rorq(unsigned long long __X, int __C) {
return __builtin_rotateright64(__X, __C);
}
#endif /* __x86_64__ */
#ifndef _MSC_VER
/* These are already provided as builtins for MSVC. */
/* Select the correct function based on the size of long. */
#ifdef __LP64__
#define _lrotl(a,b) __rolq((a), (b))
#define _lrotr(a,b) __rorq((a), (b))
#else
#define _lrotl(a,b) __rold((a), (b))
#define _lrotr(a,b) __rord((a), (b))
#endif
#define _rotl(a,b) __rold((a), (b))
#define _rotr(a,b) __rord((a), (b))
#endif // _MSC_VER
/* These are not builtins so need to be provided in all modes. */
#define _rotwl(a,b) __rolw((a), (b))
#define _rotwr(a,b) __rorw((a), (b))
#endif /* __IA32INTRIN_H */
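And a small demonstration of the rotate helpers plus the _lrotl/_rotwl convenience macros defined above (values are arbitrary; expected results in comments):

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
    printf("%#x\n", __rold(0x80000001u, 1));          /* 0x3   */
    printf("%#x\n", (unsigned)_rotwl(0x0011, 4));     /* 0x110 */
    printf("%#lx\n", (unsigned long)_lrotl(1UL, 8));  /* 0x100 */
    return 0;
}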
+29 -33
View File
@@ -1,22 +1,8 @@
/*===---- immintrin.h - Intel intrinsics -----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -195,6 +181,15 @@
#include <avx512pfintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BF16__)
#include <avx512bf16intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BF16__))
#include <avx512vlbf16intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
#include <pkuintrin.h>
#endif
@@ -241,18 +236,6 @@ _rdrand64_step(unsigned long long *__p)
#endif
#endif /* __RDRND__ */
/* __bit_scan_forward */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bit_scan_forward(int __A) {
return __builtin_ctz(__A);
}
/* __bit_scan_reverse */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bit_scan_reverse(int __A) {
return 31 - __builtin_clz(__A);
}
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
#ifdef __x86_64__
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
@@ -378,9 +361,8 @@ _storebe_i64(void * __P, long long __D) {
#include <fxsrintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
/* No feature check desired due to internal MSC_VER checks */
#include <xsaveintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
#include <xsaveoptintrin.h>
@@ -439,7 +421,21 @@ _storebe_i64(void * __P, long long __D) {
#include <invpcidintrin.h>
#endif
#ifdef _MSC_VER
#if !defined(_MSC_VER) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
#include <avx512vlvp2intersectintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
#if defined(_MSC_VER) && __has_extension(gnu_asm)
/* Define the default attributes for these intrinsics */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#ifdef __cplusplus
@@ -521,6 +517,6 @@ _InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
#undef __DEFAULT_FN_ATTRS
#endif /* _MSC_VER */
#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
#endif /* __IMMINTRIN_H */
+6 -36
View File
@@ -1,22 +1,8 @@
/* ===-------- intrin.h ---------------------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -200,10 +186,6 @@ __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
_WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
static __inline__
#define _XCR_XFEATURE_ENABLED_MASK 0
unsigned __int64 __cdecl _xgetbv(unsigned int);
void __cdecl _xsetbv(unsigned int, unsigned __int64);
/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
#ifdef __x86_64__
@@ -539,12 +521,6 @@ __cpuidex(int __info[4], int __level, int __ecx) {
__asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
: "a"(__level), "c"(__ecx));
}
static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
_xgetbv(unsigned int __xcr_no) {
unsigned int __eax, __edx;
__asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
return ((unsigned __int64)__edx << 32) | __eax;
}
static __inline__ void __DEFAULT_FN_ATTRS
__halt(void) {
__asm__ volatile ("hlt");
@@ -567,15 +543,9 @@ long _InterlockedAdd(long volatile *Addend, long Value);
__int64 _ReadStatusReg(int);
void _WriteStatusReg(int, __int64);
static inline unsigned short _byteswap_ushort (unsigned short val) {
return __builtin_bswap16(val);
}
static inline unsigned long _byteswap_ulong (unsigned long val) {
return __builtin_bswap32(val);
}
static inline unsigned __int64 _byteswap_uint64 (unsigned __int64 val) {
return __builtin_bswap64(val);
}
unsigned short __cdecl _byteswap_ushort(unsigned short val);
unsigned long __cdecl _byteswap_ulong (unsigned long val);
unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);
#endif
/*----------------------------------------------------------------------------*\
+8 -17
View File
@@ -1,27 +1,18 @@
/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
#ifndef __CLANG_INTTYPES_H
// AIX system headers need inttypes.h to be re-enterable while _STD_TYPES_T
// is defined until an inclusion of it without _STD_TYPES_T occurs, in which
// case the header guard macro is defined.
#if !defined(_AIX) || !defined(_STD_TYPES_T)
#define __CLANG_INTTYPES_H
#endif
#if defined(_MSC_VER) && _MSC_VER < 1800
#error MSVC does not have inttypes.h prior to Visual Studio 2013
+3 -17
View File
@@ -1,22 +1,8 @@
/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -19
View File
@@ -1,24 +1,8 @@
/*===---- iso646.h - Standard header for alternate spellings of operators---===
*
* Copyright (c) 2008 Eli Friedman
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -19
View File
@@ -1,24 +1,8 @@
/*===---- limits.h - Standard header for integer sizes --------------------===*\
*
* Copyright (c) 2009 Chris Lattner
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+4 -18
View File
@@ -1,22 +1,8 @@
/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -24,7 +10,7 @@
#ifndef __MMINTRIN_H
#define __MMINTRIN_H
typedef long long __m64 __attribute__((__vector_size__(8)));
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
+4 -17
View File
@@ -1,22 +1,8 @@
/*===---- module.modulemap - intrinsics module map -------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -168,4 +154,5 @@ module _Builtin_stddef_max_align_t [system] [extern_c] {
module opencl_c {
requires opencl
header "opencl-c.h"
header "opencl-c-base.h"
}
+3 -17
@@ -1,22 +1,8 @@
/*===------------------------- movdirintrin.h ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+578
@@ -0,0 +1,578 @@
//===----- opencl-c-base.h - OpenCL C language base definitions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _OPENCL_BASE_H_
#define _OPENCL_BASE_H_
// built-in scalar data types:
/**
* An unsigned 8-bit integer.
*/
typedef unsigned char uchar;
/**
* An unsigned 16-bit integer.
*/
typedef unsigned short ushort;
/**
* An unsigned 32-bit integer.
*/
typedef unsigned int uint;
/**
* An unsigned 64-bit integer.
*/
typedef unsigned long ulong;
/**
* The unsigned integer type of the result of the sizeof operator. This
* is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __SIZE_TYPE__ size_t;
/**
* A signed integer type that is the result of subtracting two pointers.
* This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit signed integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __PTRDIFF_TYPE__ ptrdiff_t;
/**
* A signed integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __INTPTR_TYPE__ intptr_t;
/**
* An unsigned integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __UINTPTR_TYPE__ uintptr_t;
// built-in vector data types:
typedef char char2 __attribute__((ext_vector_type(2)));
typedef char char3 __attribute__((ext_vector_type(3)));
typedef char char4 __attribute__((ext_vector_type(4)));
typedef char char8 __attribute__((ext_vector_type(8)));
typedef char char16 __attribute__((ext_vector_type(16)));
typedef uchar uchar2 __attribute__((ext_vector_type(2)));
typedef uchar uchar3 __attribute__((ext_vector_type(3)));
typedef uchar uchar4 __attribute__((ext_vector_type(4)));
typedef uchar uchar8 __attribute__((ext_vector_type(8)));
typedef uchar uchar16 __attribute__((ext_vector_type(16)));
typedef short short2 __attribute__((ext_vector_type(2)));
typedef short short3 __attribute__((ext_vector_type(3)));
typedef short short4 __attribute__((ext_vector_type(4)));
typedef short short8 __attribute__((ext_vector_type(8)));
typedef short short16 __attribute__((ext_vector_type(16)));
typedef ushort ushort2 __attribute__((ext_vector_type(2)));
typedef ushort ushort3 __attribute__((ext_vector_type(3)));
typedef ushort ushort4 __attribute__((ext_vector_type(4)));
typedef ushort ushort8 __attribute__((ext_vector_type(8)));
typedef ushort ushort16 __attribute__((ext_vector_type(16)));
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
typedef int int4 __attribute__((ext_vector_type(4)));
typedef int int8 __attribute__((ext_vector_type(8)));
typedef int int16 __attribute__((ext_vector_type(16)));
typedef uint uint2 __attribute__((ext_vector_type(2)));
typedef uint uint3 __attribute__((ext_vector_type(3)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
typedef uint uint8 __attribute__((ext_vector_type(8)));
typedef uint uint16 __attribute__((ext_vector_type(16)));
typedef long long2 __attribute__((ext_vector_type(2)));
typedef long long3 __attribute__((ext_vector_type(3)));
typedef long long4 __attribute__((ext_vector_type(4)));
typedef long long8 __attribute__((ext_vector_type(8)));
typedef long long16 __attribute__((ext_vector_type(16)));
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
typedef ulong ulong3 __attribute__((ext_vector_type(3)));
typedef ulong ulong4 __attribute__((ext_vector_type(4)));
typedef ulong ulong8 __attribute__((ext_vector_type(8)));
typedef ulong ulong16 __attribute__((ext_vector_type(16)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float8 __attribute__((ext_vector_type(8)));
typedef float float16 __attribute__((ext_vector_type(16)));
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef half half2 __attribute__((ext_vector_type(2)));
typedef half half3 __attribute__((ext_vector_type(3)));
typedef half half4 __attribute__((ext_vector_type(4)));
typedef half half8 __attribute__((ext_vector_type(8)));
typedef half half16 __attribute__((ext_vector_type(16)));
#endif
#ifdef cl_khr_fp64
#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));
typedef double double4 __attribute__((ext_vector_type(4)));
typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define NULL ((void*)0)
#endif
/**
* Value of maximum non-infinite single-precision floating-point
* number.
*/
#define MAXFLOAT 0x1.fffffep127f
/**
* A positive float constant expression. HUGE_VALF evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VALF (__builtin_huge_valf())
/**
* A positive double constant expression. HUGE_VAL evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VAL (__builtin_huge_val())
/**
* A constant expression of type float representing positive or
* unsigned infinity.
*/
#define INFINITY (__builtin_inff())
/**
* A constant expression of type float representing a quiet NaN.
*/
#define NAN as_float(INT_MAX)
#define FP_ILOGB0 INT_MIN
#define FP_ILOGBNAN INT_MAX
#define FLT_DIG 6
#define FLT_MANT_DIG 24
#define FLT_MAX_10_EXP +38
#define FLT_MAX_EXP +128
#define FLT_MIN_10_EXP -37
#define FLT_MIN_EXP -125
#define FLT_RADIX 2
#define FLT_MAX 0x1.fffffep127f
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
#define M_E_F 2.71828182845904523536028747135266250f
#define M_LOG2E_F 1.44269504088896340735992468100189214f
#define M_LOG10E_F 0.434294481903251827651128918916605082f
#define M_LN2_F 0.693147180559945309417232121458176568f
#define M_LN10_F 2.30258509299404568401799145468436421f
#define M_PI_F 3.14159265358979323846264338327950288f
#define M_PI_2_F 1.57079632679489661923132169163975144f
#define M_PI_4_F 0.785398163397448309615660845819875721f
#define M_1_PI_F 0.318309886183790671537767526745028724f
#define M_2_PI_F 0.636619772367581343075535053490057448f
#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
#define M_SQRT2_F 1.41421356237309504880168872420969808f
#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
#define DBL_DIG 15
#define DBL_MANT_DIG 53
#define DBL_MAX_10_EXP +308
#define DBL_MAX_EXP +1024
#define DBL_MIN_10_EXP -307
#define DBL_MIN_EXP -1021
#define DBL_RADIX 2
#define DBL_MAX 0x1.fffffffffffffp1023
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
#define M_LOG10E 0x1.bcb7b1526e50ep-2
#define M_LN2 0x1.62e42fefa39efp-1
#define M_LN10 0x1.26bb1bbb55516p+1
#define M_PI 0x1.921fb54442d18p+1
#define M_PI_2 0x1.921fb54442d18p+0
#define M_PI_4 0x1.921fb54442d18p-1
#define M_1_PI 0x1.45f306dc9c883p-2
#define M_2_PI 0x1.45f306dc9c883p-1
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
#define M_SQRT2 0x1.6a09e667f3bcdp+0
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
#ifdef cl_khr_fp16
#define HALF_DIG 3
#define HALF_MANT_DIG 11
#define HALF_MAX_10_EXP +4
#define HALF_MAX_EXP +16
#define HALF_MIN_10_EXP -4
#define HALF_MIN_EXP -13
#define HALF_RADIX 2
#define HALF_MAX ((0x1.ffcp15h))
#define HALF_MIN ((0x1.0p-14h))
#define HALF_EPSILON ((0x1.0p-10h))
#define M_E_H 2.71828182845904523536028747135266250h
#define M_LOG2E_H 1.44269504088896340735992468100189214h
#define M_LOG10E_H 0.434294481903251827651128918916605082h
#define M_LN2_H 0.693147180559945309417232121458176568h
#define M_LN10_H 2.30258509299404568401799145468436421h
#define M_PI_H 3.14159265358979323846264338327950288h
#define M_PI_2_H 1.57079632679489661923132169163975144h
#define M_PI_4_H 0.785398163397448309615660845819875721h
#define M_1_PI_H 0.318309886183790671537767526745028724h
#define M_2_PI_H 0.636619772367581343075535053490057448h
#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h
#define M_SQRT2_H 1.41421356237309504880168872420969808h
#define M_SQRT1_2_H 0.707106781186547524400844362104849039h
#endif //cl_khr_fp16
#define CHAR_BIT 8
#define SCHAR_MAX 127
#define SCHAR_MIN (-128)
#define UCHAR_MAX 255
#define CHAR_MAX SCHAR_MAX
#define CHAR_MIN SCHAR_MIN
#define USHRT_MAX 65535
#define SHRT_MAX 32767
#define SHRT_MIN (-32768)
#define UINT_MAX 0xffffffff
#define INT_MAX 2147483647
#define INT_MIN (-2147483647-1)
#define ULONG_MAX 0xffffffffffffffffUL
#define LONG_MAX 0x7fffffffffffffffL
#define LONG_MIN (-0x7fffffffffffffffL-1)
// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
typedef uint cl_mem_fence_flags;
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to local memory
*/
#define CLK_LOCAL_MEM_FENCE 0x01
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to global memory
*/
#define CLK_GLOBAL_MEM_FENCE 0x02
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
/**
* Queue a memory fence to ensure correct ordering of memory
* operations between work-items of a work-group to
* image memory.
*/
#define CLK_IMAGE_MEM_FENCE 0x04
#ifndef ATOMIC_VAR_INIT
#define ATOMIC_VAR_INIT(x) (x)
#endif //ATOMIC_VAR_INIT
#define ATOMIC_FLAG_INIT 0
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
// These values need to match the runtime equivalent
//
// Addressing Mode.
//
#define CLK_ADDRESS_NONE 0
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
#define CLK_ADDRESS_CLAMP 4
#define CLK_ADDRESS_REPEAT 6
#define CLK_ADDRESS_MIRRORED_REPEAT 8
//
// Coordination Normalization
//
#define CLK_NORMALIZED_COORDS_FALSE 0
#define CLK_NORMALIZED_COORDS_TRUE 1
//
// Filtering Mode.
//
#define CLK_FILTER_NEAREST 0x10
#define CLK_FILTER_LINEAR 0x20
#ifdef cl_khr_gl_msaa_sharing
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
#endif //cl_khr_gl_msaa_sharing
//
// Channel Datatype.
//
#define CLK_SNORM_INT8 0x10D0
#define CLK_SNORM_INT16 0x10D1
#define CLK_UNORM_INT8 0x10D2
#define CLK_UNORM_INT16 0x10D3
#define CLK_UNORM_SHORT_565 0x10D4
#define CLK_UNORM_SHORT_555 0x10D5
#define CLK_UNORM_INT_101010 0x10D6
#define CLK_SIGNED_INT8 0x10D7
#define CLK_SIGNED_INT16 0x10D8
#define CLK_SIGNED_INT32 0x10D9
#define CLK_UNSIGNED_INT8 0x10DA
#define CLK_UNSIGNED_INT16 0x10DB
#define CLK_UNSIGNED_INT32 0x10DC
#define CLK_HALF_FLOAT 0x10DD
#define CLK_FLOAT 0x10DE
#define CLK_UNORM_INT24 0x10DF
// Channel order, numbering must be aligned with cl_channel_order in cl.h
//
#define CLK_R 0x10B0
#define CLK_A 0x10B1
#define CLK_RG 0x10B2
#define CLK_RA 0x10B3
#define CLK_RGB 0x10B4
#define CLK_RGBA 0x10B5
#define CLK_BGRA 0x10B6
#define CLK_ARGB 0x10B7
#define CLK_INTENSITY 0x10B8
#define CLK_LUMINANCE 0x10B9
#define CLK_Rx 0x10BA
#define CLK_RGx 0x10BB
#define CLK_RGBx 0x10BC
#define CLK_DEPTH 0x10BD
#define CLK_DEPTH_STENCIL 0x10BE
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_sRGB 0x10BF
#define CLK_sRGBx 0x10C0
#define CLK_sRGBA 0x10C1
#define CLK_sBGRA 0x10C2
#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v2.0 s6.13.16 - Pipe Functions
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CLK_SUCCESS 0
#define CLK_ENQUEUE_FAILURE -101
#define CLK_INVALID_QUEUE -102
#define CLK_INVALID_NDRANGE -160
#define CLK_INVALID_EVENT_WAIT_LIST -57
#define CLK_DEVICE_QUEUE_FULL -161
#define CLK_INVALID_ARG_SIZE -51
#define CLK_EVENT_ALLOCATION_FAILURE -100
#define CLK_OUT_OF_RESOURCES -5
#define CLK_NULL_QUEUE 0
#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
// execution model related definitions
#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0
#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1
#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2
typedef int kernel_enqueue_flags_t;
typedef int clk_profiling_info;
// Profiling info name (see capture_event_profiling_info)
#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
#define MAX_WORK_DIM 3
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30)
#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
#endif // cl_intel_device_side_avc_motion_estimation
#endif //_OPENCL_BASE_H_
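
opencl-c-base.h is the new home for the scalar and vector typedefs, the numeric limit macros, and the CLK_* constants that previously lived only in opencl-c.h (the corresponding removals appear in the opencl-c.h hunks below), and module.modulemap now lists it alongside opencl-c.h. A hedged usage sketch, not part of the commit, touching only declarations visible in the listing above:

// Kernel relying on definitions from opencl-c-base.h: the float4 vector
// typedef, size_t, FLT_EPSILON, and the CLK_GLOBAL_MEM_FENCE flag.
// get_global_id() and barrier() themselves are still declared in opencl-c.h,
// which clang provides by default when compiling OpenCL sources.
__kernel void scale(__global float4 *buf, float s) {
    size_t i = get_global_id(0);
    float4 v = buf[i];
    if (s > FLT_EPSILON)      // skip scaling by (near) zero
        v *= s;
    buf[i] = v;
    barrier(CLK_GLOBAL_MEM_FENCE);
}
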
+79 -619
@@ -1,15 +1,16 @@
//===--- opencl-c.h - OpenCL C language builtin function header -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _OPENCL_H_
#define _OPENCL_H_
#include "opencl-c-base.h"
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#ifndef cl_khr_depth_images
#define cl_khr_depth_images
@@ -23,9 +24,6 @@
#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0
#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
#ifndef cl_intel_planar_yuv
#define cl_intel_planar_yuv
#endif // cl_intel_planar_yuv
#pragma OPENCL EXTENSION cl_intel_planar_yuv : begin
#pragma OPENCL EXTENSION cl_intel_planar_yuv : end
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2
@@ -37,255 +35,6 @@
#define __purefn __attribute__((pure))
#define __cnfn __attribute__((const))
// built-in scalar data types:
/**
* An unsigned 8-bit integer.
*/
typedef unsigned char uchar;
/**
* An unsigned 16-bit integer.
*/
typedef unsigned short ushort;
/**
* An unsigned 32-bit integer.
*/
typedef unsigned int uint;
/**
* An unsigned 64-bit integer.
*/
typedef unsigned long ulong;
/**
* The unsigned integer type of the result of the sizeof operator. This
* is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __SIZE_TYPE__ size_t;
/**
* A signed integer type that is the result of subtracting two pointers.
* This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit signed integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __PTRDIFF_TYPE__ ptrdiff_t;
/**
* A signed integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __INTPTR_TYPE__ intptr_t;
/**
* An unsigned integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __UINTPTR_TYPE__ uintptr_t;
// built-in vector data types:
typedef char char2 __attribute__((ext_vector_type(2)));
typedef char char3 __attribute__((ext_vector_type(3)));
typedef char char4 __attribute__((ext_vector_type(4)));
typedef char char8 __attribute__((ext_vector_type(8)));
typedef char char16 __attribute__((ext_vector_type(16)));
typedef uchar uchar2 __attribute__((ext_vector_type(2)));
typedef uchar uchar3 __attribute__((ext_vector_type(3)));
typedef uchar uchar4 __attribute__((ext_vector_type(4)));
typedef uchar uchar8 __attribute__((ext_vector_type(8)));
typedef uchar uchar16 __attribute__((ext_vector_type(16)));
typedef short short2 __attribute__((ext_vector_type(2)));
typedef short short3 __attribute__((ext_vector_type(3)));
typedef short short4 __attribute__((ext_vector_type(4)));
typedef short short8 __attribute__((ext_vector_type(8)));
typedef short short16 __attribute__((ext_vector_type(16)));
typedef ushort ushort2 __attribute__((ext_vector_type(2)));
typedef ushort ushort3 __attribute__((ext_vector_type(3)));
typedef ushort ushort4 __attribute__((ext_vector_type(4)));
typedef ushort ushort8 __attribute__((ext_vector_type(8)));
typedef ushort ushort16 __attribute__((ext_vector_type(16)));
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
typedef int int4 __attribute__((ext_vector_type(4)));
typedef int int8 __attribute__((ext_vector_type(8)));
typedef int int16 __attribute__((ext_vector_type(16)));
typedef uint uint2 __attribute__((ext_vector_type(2)));
typedef uint uint3 __attribute__((ext_vector_type(3)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
typedef uint uint8 __attribute__((ext_vector_type(8)));
typedef uint uint16 __attribute__((ext_vector_type(16)));
typedef long long2 __attribute__((ext_vector_type(2)));
typedef long long3 __attribute__((ext_vector_type(3)));
typedef long long4 __attribute__((ext_vector_type(4)));
typedef long long8 __attribute__((ext_vector_type(8)));
typedef long long16 __attribute__((ext_vector_type(16)));
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
typedef ulong ulong3 __attribute__((ext_vector_type(3)));
typedef ulong ulong4 __attribute__((ext_vector_type(4)));
typedef ulong ulong8 __attribute__((ext_vector_type(8)));
typedef ulong ulong16 __attribute__((ext_vector_type(16)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float8 __attribute__((ext_vector_type(8)));
typedef float float16 __attribute__((ext_vector_type(16)));
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef half half2 __attribute__((ext_vector_type(2)));
typedef half half3 __attribute__((ext_vector_type(3)));
typedef half half4 __attribute__((ext_vector_type(4)));
typedef half half8 __attribute__((ext_vector_type(8)));
typedef half half16 __attribute__((ext_vector_type(16)));
#endif
#ifdef cl_khr_fp64
#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));
typedef double double4 __attribute__((ext_vector_type(4)));
typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define NULL ((void*)0)
#endif
/**
* Value of maximum non-infinite single-precision floating-point
* number.
*/
#define MAXFLOAT 0x1.fffffep127f
/**
* A positive float constant expression. HUGE_VALF evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VALF (__builtin_huge_valf())
/**
* A positive double constant expression. HUGE_VAL evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VAL (__builtin_huge_val())
/**
* A constant expression of type float representing positive or
* unsigned infinity.
*/
#define INFINITY (__builtin_inff())
/**
* A constant expression of type float representing a quiet NaN.
*/
#define NAN as_float(INT_MAX)
#define FP_ILOGB0 INT_MIN
#define FP_ILOGBNAN INT_MAX
#define FLT_DIG 6
#define FLT_MANT_DIG 24
#define FLT_MAX_10_EXP +38
#define FLT_MAX_EXP +128
#define FLT_MIN_10_EXP -37
#define FLT_MIN_EXP -125
#define FLT_RADIX 2
#define FLT_MAX 0x1.fffffep127f
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
#define M_E_F 2.71828182845904523536028747135266250f
#define M_LOG2E_F 1.44269504088896340735992468100189214f
#define M_LOG10E_F 0.434294481903251827651128918916605082f
#define M_LN2_F 0.693147180559945309417232121458176568f
#define M_LN10_F 2.30258509299404568401799145468436421f
#define M_PI_F 3.14159265358979323846264338327950288f
#define M_PI_2_F 1.57079632679489661923132169163975144f
#define M_PI_4_F 0.785398163397448309615660845819875721f
#define M_1_PI_F 0.318309886183790671537767526745028724f
#define M_2_PI_F 0.636619772367581343075535053490057448f
#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
#define M_SQRT2_F 1.41421356237309504880168872420969808f
#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
#define DBL_DIG 15
#define DBL_MANT_DIG 53
#define DBL_MAX_10_EXP +308
#define DBL_MAX_EXP +1024
#define DBL_MIN_10_EXP -307
#define DBL_MIN_EXP -1021
#define DBL_RADIX 2
#define DBL_MAX 0x1.fffffffffffffp1023
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
#define M_LOG10E 0x1.bcb7b1526e50ep-2
#define M_LN2 0x1.62e42fefa39efp-1
#define M_LN10 0x1.26bb1bbb55516p+1
#define M_PI 0x1.921fb54442d18p+1
#define M_PI_2 0x1.921fb54442d18p+0
#define M_PI_4 0x1.921fb54442d18p-1
#define M_1_PI 0x1.45f306dc9c883p-2
#define M_2_PI 0x1.45f306dc9c883p-1
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
#define M_SQRT2 0x1.6a09e667f3bcdp+0
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
#ifdef cl_khr_fp16
#define HALF_DIG 3
#define HALF_MANT_DIG 11
#define HALF_MAX_10_EXP +4
#define HALF_MAX_EXP +16
#define HALF_MIN_10_EXP -4
#define HALF_MIN_EXP -13
#define HALF_RADIX 2
#define HALF_MAX ((0x1.ffcp15h))
#define HALF_MIN ((0x1.0p-14h))
#define HALF_EPSILON ((0x1.0p-10h))
#define M_E_H 2.71828182845904523536028747135266250h
#define M_LOG2E_H 1.44269504088896340735992468100189214h
#define M_LOG10E_H 0.434294481903251827651128918916605082h
#define M_LN2_H 0.693147180559945309417232121458176568h
#define M_LN10_H 2.30258509299404568401799145468436421h
#define M_PI_H 3.14159265358979323846264338327950288h
#define M_PI_2_H 1.57079632679489661923132169163975144h
#define M_PI_4_H 0.785398163397448309615660845819875721h
#define M_1_PI_H 0.318309886183790671537767526745028724h
#define M_2_PI_H 0.636619772367581343075535053490057448h
#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h
#define M_SQRT2_H 1.41421356237309504880168872420969808h
#define M_SQRT1_2_H 0.707106781186547524400844362104849039h
#endif //cl_khr_fp16
#define CHAR_BIT 8
#define SCHAR_MAX 127
#define SCHAR_MIN (-128)
#define UCHAR_MAX 255
#define CHAR_MAX SCHAR_MAX
#define CHAR_MIN SCHAR_MIN
#define USHRT_MAX 65535
#define SHRT_MAX 32767
#define SHRT_MIN (-32768)
#define UINT_MAX 0xffffffff
#define INT_MAX 2147483647
#define INT_MIN (-2147483647-1)
#define ULONG_MAX 0xffffffffffffffffUL
#define LONG_MAX 0x7fffffffffffffffL
#define LONG_MIN (-0x7fffffffffffffffL-1)
// OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions
@@ -9598,8 +9347,6 @@ long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval);
ulong8 __ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval);
long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval);
ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval);
char __ovld __cnfn clamp(char x, char minval, char maxval);
uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
char2 __ovld __cnfn clamp(char2 x, char minval, char maxval);
uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval);
char3 __ovld __cnfn clamp(char3 x, char minval, char maxval);
@@ -9610,8 +9357,6 @@ char8 __ovld __cnfn clamp(char8 x, char minval, char maxval);
uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval);
char16 __ovld __cnfn clamp(char16 x, char minval, char maxval);
uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval);
short __ovld __cnfn clamp(short x, short minval, short maxval);
ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
short2 __ovld __cnfn clamp(short2 x, short minval, short maxval);
ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval);
short3 __ovld __cnfn clamp(short3 x, short minval, short maxval);
@@ -9622,8 +9367,6 @@ short8 __ovld __cnfn clamp(short8 x, short minval, short maxval);
ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval);
short16 __ovld __cnfn clamp(short16 x, short minval, short maxval);
ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval);
int __ovld __cnfn clamp(int x, int minval, int maxval);
uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
int2 __ovld __cnfn clamp(int2 x, int minval, int maxval);
uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval);
int3 __ovld __cnfn clamp(int3 x, int minval, int maxval);
@@ -9634,8 +9377,6 @@ int8 __ovld __cnfn clamp(int8 x, int minval, int maxval);
uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval);
int16 __ovld __cnfn clamp(int16 x, int minval, int maxval);
uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval);
long __ovld __cnfn clamp(long x, long minval, long maxval);
ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
long2 __ovld __cnfn clamp(long2 x, long minval, long maxval);
ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval);
long3 __ovld __cnfn clamp(long3 x, long minval, long maxval);
@@ -9911,8 +9652,6 @@ long8 __ovld __cnfn max(long8 x, long8 y);
ulong8 __ovld __cnfn max(ulong8 x, ulong8 y);
long16 __ovld __cnfn max(long16 x, long16 y);
ulong16 __ovld __cnfn max(ulong16 x, ulong16 y);
char __ovld __cnfn max(char x, char y);
uchar __ovld __cnfn max(uchar x, uchar y);
char2 __ovld __cnfn max(char2 x, char y);
uchar2 __ovld __cnfn max(uchar2 x, uchar y);
char3 __ovld __cnfn max(char3 x, char y);
@@ -9923,8 +9662,6 @@ char8 __ovld __cnfn max(char8 x, char y);
uchar8 __ovld __cnfn max(uchar8 x, uchar y);
char16 __ovld __cnfn max(char16 x, char y);
uchar16 __ovld __cnfn max(uchar16 x, uchar y);
short __ovld __cnfn max(short x, short y);
ushort __ovld __cnfn max(ushort x, ushort y);
short2 __ovld __cnfn max(short2 x, short y);
ushort2 __ovld __cnfn max(ushort2 x, ushort y);
short3 __ovld __cnfn max(short3 x, short y);
@@ -9935,8 +9672,6 @@ short8 __ovld __cnfn max(short8 x, short y);
ushort8 __ovld __cnfn max(ushort8 x, ushort y);
short16 __ovld __cnfn max(short16 x, short y);
ushort16 __ovld __cnfn max(ushort16 x, ushort y);
int __ovld __cnfn max(int x, int y);
uint __ovld __cnfn max(uint x, uint y);
int2 __ovld __cnfn max(int2 x, int y);
uint2 __ovld __cnfn max(uint2 x, uint y);
int3 __ovld __cnfn max(int3 x, int y);
@@ -9947,8 +9682,6 @@ int8 __ovld __cnfn max(int8 x, int y);
uint8 __ovld __cnfn max(uint8 x, uint y);
int16 __ovld __cnfn max(int16 x, int y);
uint16 __ovld __cnfn max(uint16 x, uint y);
long __ovld __cnfn max(long x, long y);
ulong __ovld __cnfn max(ulong x, ulong y);
long2 __ovld __cnfn max(long2 x, long y);
ulong2 __ovld __cnfn max(ulong2 x, ulong y);
long3 __ovld __cnfn max(long3 x, long y);
@@ -10011,8 +9744,6 @@ long8 __ovld __cnfn min(long8 x, long8 y);
ulong8 __ovld __cnfn min(ulong8 x, ulong8 y);
long16 __ovld __cnfn min(long16 x, long16 y);
ulong16 __ovld __cnfn min(ulong16 x, ulong16 y);
char __ovld __cnfn min(char x, char y);
uchar __ovld __cnfn min(uchar x, uchar y);
char2 __ovld __cnfn min(char2 x, char y);
uchar2 __ovld __cnfn min(uchar2 x, uchar y);
char3 __ovld __cnfn min(char3 x, char y);
@@ -10023,8 +9754,6 @@ char8 __ovld __cnfn min(char8 x, char y);
uchar8 __ovld __cnfn min(uchar8 x, uchar y);
char16 __ovld __cnfn min(char16 x, char y);
uchar16 __ovld __cnfn min(uchar16 x, uchar y);
short __ovld __cnfn min(short x, short y);
ushort __ovld __cnfn min(ushort x, ushort y);
short2 __ovld __cnfn min(short2 x, short y);
ushort2 __ovld __cnfn min(ushort2 x, ushort y);
short3 __ovld __cnfn min(short3 x, short y);
@@ -10035,8 +9764,6 @@ short8 __ovld __cnfn min(short8 x, short y);
ushort8 __ovld __cnfn min(ushort8 x, ushort y);
short16 __ovld __cnfn min(short16 x, short y);
ushort16 __ovld __cnfn min(ushort16 x, ushort y);
int __ovld __cnfn min(int x, int y);
uint __ovld __cnfn min(uint x, uint y);
int2 __ovld __cnfn min(int2 x, int y);
uint2 __ovld __cnfn min(uint2 x, uint y);
int3 __ovld __cnfn min(int3 x, int y);
@@ -10047,8 +9774,6 @@ int8 __ovld __cnfn min(int8 x, int y);
uint8 __ovld __cnfn min(uint8 x, uint y);
int16 __ovld __cnfn min(int16 x, int y);
uint16 __ovld __cnfn min(uint16 x, uint y);
long __ovld __cnfn min(long x, long y);
ulong __ovld __cnfn min(ulong x, ulong y);
long2 __ovld __cnfn min(long2 x, long y);
ulong2 __ovld __cnfn min(ulong2 x, ulong y);
long3 __ovld __cnfn min(long3 x, long y);
@@ -10627,7 +10352,6 @@ half3 __ovld __cnfn step(half3 edge, half3 x);
half4 __ovld __cnfn step(half4 edge, half4 x);
half8 __ovld __cnfn step(half8 edge, half8 x);
half16 __ovld __cnfn step(half16 edge, half16 x);
half __ovld __cnfn step(half edge, half x);
half2 __ovld __cnfn step(half edge, half2 x);
half3 __ovld __cnfn step(half edge, half3 x);
half4 __ovld __cnfn step(half edge, half4 x);
@@ -10679,7 +10403,6 @@ half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x);
half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x);
half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x);
half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x);
half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x);
half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x);
half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x);
@@ -12777,30 +12500,6 @@ void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p);
// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
typedef uint cl_mem_fence_flags;
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to local memory
*/
#define CLK_LOCAL_MEM_FENCE 0x01
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to global memory
*/
#define CLK_GLOBAL_MEM_FENCE 0x02
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
/**
* Queue a memory fence to ensure correct ordering of memory
* operations between work-items of a work-group to
* image memory.
*/
#define CLK_IMAGE_MEM_FENCE 0x04
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
/**
* All work-items in a work-group executing the kernel
* on a processor must execute this function before any
@@ -12834,17 +12533,6 @@ typedef uint cl_mem_fence_flags;
void __ovld __conv barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -13341,6 +13029,10 @@ int __ovld atomic_add(volatile __global int *p, int val);
unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_add(volatile __local int *p, int val);
unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_add(volatile int *p, int val);
unsigned int __ovld atomic_add(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_add(volatile __global int *p, int val);
@@ -13367,6 +13059,10 @@ int __ovld atomic_sub(volatile __global int *p, int val);
unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_sub(volatile __local int *p, int val);
unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_sub(volatile int *p, int val);
unsigned int __ovld atomic_sub(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_sub(volatile __global int *p, int val);
@@ -13395,6 +13091,11 @@ int __ovld atomic_xchg(volatile __local int *p, int val);
unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val);
float __ovld atomic_xchg(volatile __global float *p, float val);
float __ovld atomic_xchg(volatile __local float *p, float val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xchg(volatile int *p, int val);
unsigned int __ovld atomic_xchg(volatile unsigned int *p, unsigned int val);
float __ovld atomic_xchg(volatile float *p, float val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
@@ -13422,6 +13123,10 @@ int __ovld atomic_inc(volatile __global int *p);
unsigned int __ovld atomic_inc(volatile __global unsigned int *p);
int __ovld atomic_inc(volatile __local int *p);
unsigned int __ovld atomic_inc(volatile __local unsigned int *p);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_inc(volatile int *p);
unsigned int __ovld atomic_inc(volatile unsigned int *p);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_inc(volatile __global int *p);
@@ -13449,6 +13154,10 @@ int __ovld atomic_dec(volatile __global int *p);
unsigned int __ovld atomic_dec(volatile __global unsigned int *p);
int __ovld atomic_dec(volatile __local int *p);
unsigned int __ovld atomic_dec(volatile __local unsigned int *p);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_dec(volatile int *p);
unsigned int __ovld atomic_dec(volatile unsigned int *p);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_dec(volatile __global int *p);
@@ -13477,6 +13186,10 @@ int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile unsigned int *p, unsigned int cmp, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
@@ -13505,6 +13218,10 @@ int __ovld atomic_min(volatile __global int *p, int val);
unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_min(volatile __local int *p, int val);
unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_min(volatile int *p, int val);
unsigned int __ovld atomic_min(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_min(volatile __global int *p, int val);
@@ -13533,6 +13250,10 @@ int __ovld atomic_max(volatile __global int *p, int val);
unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_max(volatile __local int *p, int val);
unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_max(volatile int *p, int val);
unsigned int __ovld atomic_max(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_max(volatile __global int *p, int val);
@@ -13560,6 +13281,10 @@ int __ovld atomic_and(volatile __global int *p, int val);
unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_and(volatile __local int *p, int val);
unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_and(volatile int *p, int val);
unsigned int __ovld atomic_and(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_and(volatile __global int *p, int val);
@@ -13587,6 +13312,10 @@ int __ovld atomic_or(volatile __global int *p, int val);
unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_or(volatile __local int *p, int val);
unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_or(volatile int *p, int val);
unsigned int __ovld atomic_or(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_or(volatile __global int *p, int val);
@@ -13614,6 +13343,10 @@ int __ovld atomic_xor(volatile __global int *p, int val);
unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_xor(volatile __local int *p, int val);
unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xor(volatile int *p, int val);
unsigned int __ovld atomic_xor(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_xor(volatile __global int *p, int val);
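
The atomics hunks all follow one pattern: each legacy atomic_* builtin gains an overload without an address-space qualifier, guarded by __OPENCL_CPP_VERSION__, so that C++ for OpenCL sources can call them through unqualified pointers. A minimal sketch under that reading (the kernel itself is not taken from the commit):

// In OpenCL C this resolves to the existing __global overload; when built as
// C++ for OpenCL (__OPENCL_CPP_VERSION__ defined) the new unqualified
// overload is available as well.
__kernel void count_nonzero(__global const int *in, __global int *counter) {
    size_t i = get_global_id(0);
    if (in[i] != 0)
        atomic_inc(counter);
}
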
@@ -13639,20 +13372,6 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v
// OpenCL v2.0 s6.13.11 - Atomics Functions
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#ifndef ATOMIC_VAR_INIT
#define ATOMIC_VAR_INIT(x) (x)
#endif //ATOMIC_VAR_INIT
#define ATOMIC_FLAG_INIT 0
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
// double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
@@ -14470,33 +14189,11 @@ half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...);
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
#endif
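
The printf prototype is otherwise unchanged; the two adjacent declarations above are the before/after, with the new __attribute__((format(printf, 1, 2))) letting clang apply -Wformat checking to kernel-side printf calls (available from OpenCL C 1.2 onward, per the surrounding guard). A hedged example, not from the diff:

__kernel void dump_ids(void) {
    // Arguments are now checked against the format string:
    printf("global id %u of %u\n",
           (uint)get_global_id(0), (uint)get_global_size(0));
    // printf("%d\n", 0.5f);   // would now be flagged by -Wformat
}
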
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
// These values need to match the runtime equivalent
//
// Addressing Mode.
//
#define CLK_ADDRESS_NONE 0
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
#define CLK_ADDRESS_CLAMP 4
#define CLK_ADDRESS_REPEAT 6
#define CLK_ADDRESS_MIRRORED_REPEAT 8
//
// Coordination Normalization
//
#define CLK_NORMALIZED_COORDS_FALSE 0
#define CLK_NORMALIZED_COORDS_TRUE 1
//
// Filtering Mode.
//
#define CLK_FILTER_NEAREST 0x10
#define CLK_FILTER_LINEAR 0x20
#ifdef cl_khr_gl_msaa_sharing
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
#endif //cl_khr_gl_msaa_sharing
@@ -14712,30 +14409,6 @@ float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler,
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
#endif //cl_khr_mipmap_image
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -14895,29 +14568,6 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler
int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
#endif //cl_khr_mipmap_image
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -15332,26 +14982,6 @@ int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
* CLK_FLOAT
*/
//
// Channel Datatype.
//
#define CLK_SNORM_INT8 0x10D0
#define CLK_SNORM_INT16 0x10D1
#define CLK_UNORM_INT8 0x10D2
#define CLK_UNORM_INT16 0x10D3
#define CLK_UNORM_SHORT_565 0x10D4
#define CLK_UNORM_SHORT_555 0x10D5
#define CLK_UNORM_INT_101010 0x10D6
#define CLK_SIGNED_INT8 0x10D7
#define CLK_SIGNED_INT16 0x10D8
#define CLK_SIGNED_INT32 0x10D9
#define CLK_UNSIGNED_INT8 0x10DA
#define CLK_UNSIGNED_INT16 0x10DB
#define CLK_UNSIGNED_INT32 0x10DC
#define CLK_HALF_FLOAT 0x10DD
#define CLK_FLOAT 0x10DE
#define CLK_UNORM_INT24 0x10DF
int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image);
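/* Illustrative sketch: the query result can be compared against the CLK_*
   channel data type constants above (img is a placeholder):

     int dt = get_image_channel_data_type(img);
     if (dt == CLK_FLOAT || dt == CLK_HALF_FLOAT) {
       // sample with read_imagef
     } else if (dt == CLK_UNSIGNED_INT32) {
       // sample with read_imageui
     }
*/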
@@ -15423,30 +15053,6 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
* CLK_INTENSITY
* CLK_LUMINANCE
*/
// Channel order, numbering must be aligned with cl_channel_order in cl.h
//
#define CLK_R 0x10B0
#define CLK_A 0x10B1
#define CLK_RG 0x10B2
#define CLK_RA 0x10B3
#define CLK_RGB 0x10B4
#define CLK_RGBA 0x10B5
#define CLK_BGRA 0x10B6
#define CLK_ARGB 0x10B7
#define CLK_INTENSITY 0x10B8
#define CLK_LUMINANCE 0x10B9
#define CLK_Rx 0x10BA
#define CLK_RGx 0x10BB
#define CLK_RGBx 0x10BC
#define CLK_DEPTH 0x10BD
#define CLK_DEPTH_STENCIL 0x10BE
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_sRGB 0x10BF
#define CLK_sRGBx 0x10C0
#define CLK_sRGBA 0x10C1
#define CLK_sBGRA 0x10C2
#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image);
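/* Illustrative sketch: the returned order can be compared against the CLK_*
   channel order constants above (img is a placeholder):

     int order = get_image_channel_order(img);
     int has_alpha = (order == CLK_RGBA) || (order == CLK_BGRA) ||
                     (order == CLK_ARGB) || (order == CLK_RA) ||
                     (order == CLK_A);
*/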
@@ -15605,20 +15211,17 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t
#if defined(cl_khr_gl_msaa_sharing)
int __ovld get_image_num_samples(read_only image2d_msaa_t image);
int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image);
int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
int __ovld get_image_num_samples(read_only image2d_array_msaa_t image);
int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
int __ovld get_image_num_samples(write_only image2d_msaa_t image);
int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image);
int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
int __ovld get_image_num_samples(write_only image2d_array_msaa_t image);
int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld get_image_num_samples(read_write image2d_msaa_t image);
int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image);
int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
int __ovld get_image_num_samples(read_write image2d_array_msaa_t image);
int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -15728,7 +15331,6 @@ double __ovld __conv work_group_scan_inclusive_max(double x);
// OpenCL v2.0 s6.13.16 - Pipe Functions
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
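/* Illustrative sketch of the reservation API above, using the standard
   OpenCL 2.0 pipe built-ins (out_pipe and src are placeholders):

     __kernel void producer(write_only pipe int out_pipe, __global const int *src) {
       reserve_id_t rid = reserve_write_pipe(out_pipe, 1);
       if (is_valid_reserve_id(rid)) {
         write_pipe(out_pipe, rid, 0, &src[get_global_id(0)]);
         commit_write_pipe(out_pipe, rid);
       }
     }
*/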
@@ -15736,44 +15338,6 @@ bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CLK_SUCCESS 0
#define CLK_ENQUEUE_FAILURE -101
#define CLK_INVALID_QUEUE -102
#define CLK_INVALID_NDRANGE -160
#define CLK_INVALID_EVENT_WAIT_LIST -57
#define CLK_DEVICE_QUEUE_FULL -161
#define CLK_INVALID_ARG_SIZE -51
#define CLK_EVENT_ALLOCATION_FAILURE -100
#define CLK_OUT_OF_RESOURCES -5
#define CLK_NULL_QUEUE 0
#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
// execution model related definitions
#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0
#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1
#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2
typedef int kernel_enqueue_flags_t;
typedef int clk_profiling_info;
// Profiling info name (see capture_event_profiling_info)
#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
#define MAX_WORK_DIM 3
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
ndrange_t __ovld ndrange_1D(size_t);
ndrange_t __ovld ndrange_1D(size_t, size_t);
ndrange_t __ovld ndrange_1D(size_t, size_t, size_t);
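/* Illustrative sketch: an ndrange_t built with ndrange_1D is passed to
   enqueue_kernel together with the flag and status constants above
   (the sizes and the child block body are placeholders):

     ndrange_t nd = ndrange_1D(1024, 64);  // global size 1024, local size 64
     int err = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, nd,
                              ^{ child_work(); });  // child_work is a placeholder
     if (err != CLK_SUCCESS) {
       // e.g. CLK_DEVICE_QUEUE_FULL or CLK_INVALID_NDRANGE
     }
*/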
@@ -16216,138 +15780,6 @@ void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, u
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30)
#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
// MCE built-in functions
uchar __ovld
intel_sub_group_avc_mce_get_default_inter_base_multi_reference_penalty(
@@ -17034,6 +16466,34 @@ uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
#endif // cl_amd_media_ops2
#if defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : begin
uint __ovld arm_dot(uchar4 a, uchar4 b);
int __ovld arm_dot(char4 a, char4 b);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : end
#endif // defined(cl_arm_integer_dot_product_int8)
#if defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : begin
uint __ovld arm_dot_acc(uchar4 a, uchar4 b, uint c);
int __ovld arm_dot_acc(char4 a, char4 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
#if defined(cl_arm_integer_dot_product_accumulate_int16)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : begin
uint __ovld arm_dot_acc(ushort2 a, ushort2 b, uint c);
int __ovld arm_dot_acc(short2 a, short2 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_int16)
#if defined(cl_arm_integer_dot_product_accumulate_saturate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : begin
uint __ovld arm_dot_acc_sat(uchar4 a, uchar4 b, uint c);
int __ovld arm_dot_acc_sat(char4 a, char4 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_saturate_int8)
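/* Illustrative sketch of the ARM dot product built-ins above; each call
   requires its respective cl_arm_* extension and the input literals are
   arbitrary:

     uint u = arm_dot((uchar4)(1, 2, 3, 4), (uchar4)(5, 6, 7, 8));           // 70
     uint v = arm_dot_acc((uchar4)(1, 2, 3, 4), (uchar4)(5, 6, 7, 8), 100u); // 170
*/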
// Disable any extensions we may have enabled previously.
#pragma OPENCL EXTENSION all : disable
@@ -0,0 +1,35 @@
/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if defined(__NVPTX__) && defined(_OPENMP)
/// TODO:
/// We are currently reusing the functionality of the Clang-CUDA code path
/// as an alternative to the host declarations provided by math.h and cmath.
/// This is suboptimal.
///
/// We should instead declare the device functions in a similar way, e.g.,
/// through OpenMP 5.0 variants, and afterwards populate the module with the
/// host declarations by unconditionally including the host math.h or cmath,
/// respectively. This is actually what the Clang-CUDA code path does, using
/// __device__ instead of variants to avoid redeclarations and get the desired
/// overload resolution.
#define __CUDA__
#if defined(__cplusplus)
#include <__clang_cuda_cmath.h>
#endif
#undef __CUDA__
/// Magic macro for stopping the math.h/cmath host header from being included.
#define __CLANG_NO_HOST_MATH__
#endif
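/* Illustrative sketch of the intended effect on user code: with OpenMP
   offloading to NVPTX, math calls inside a target region are meant to
   resolve against the device declarations pulled in by these wrappers
   rather than the host math headers. Function and variable names below
   are placeholders:

     #include <math.h>
     void scale(double *x, int n) {
     #pragma omp target teams distribute parallel for map(tofrom: x[0:n])
       for (int i = 0; i < n; ++i)
         x[i] = sqrt(x[i]);   // resolved against the device-side declarations
     }
*/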
@@ -0,0 +1,33 @@
/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_OPENMP_MATH_DECLARES_H__
#define __CLANG_OPENMP_MATH_DECLARES_H__
#ifndef _OPENMP
#error "This file is for OpenMP compilation only."
#endif
#if defined(__NVPTX__) && defined(_OPENMP)
#define __CUDA__
#if defined(__cplusplus)
#include <__clang_cuda_math_forward_declares.h>
#endif
/// Include declarations for libdevice functions.
#include <__clang_cuda_libdevice_declares.h>
/// Provide definitions for these functions.
#include <__clang_cuda_device_functions.h>
#undef __CUDA__
#endif
#endif
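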
+16
@@ -0,0 +1,16 @@
/*===-------------- cmath - Alternative cmath header -----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#include <__clang_openmp_math.h>
#ifndef __CLANG_NO_HOST_MATH__
#include_next <cmath>
#else
#undef __CLANG_NO_HOST_MATH__
#endif
+17
@@ -0,0 +1,17 @@
/*===------------- math.h - Alternative math.h header ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#include <__clang_openmp_math.h>
#ifndef __CLANG_NO_HOST_MATH__
#include_next <math.h>
#else
#undef __CLANG_NO_HOST_MATH__
#endif
+7 -17
@@ -1,22 +1,8 @@
/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -30,6 +16,8 @@
#define __PCONFIG_KEY_PROGRAM 0x00000001
#if __has_extension(gnu_asm)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
@@ -47,4 +35,6 @@ _pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
#undef __DEFAULT_FN_ATTRS
#endif /* __has_extension(gnu_asm) */
#endif
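/* Illustrative sketch, assuming the wrapper passes the leaf's input registers
   through __d[] (RBX/RCX/RDX) and returns the EAX status; the key-programming
   structure pointer below is a placeholder:

     __SIZE_TYPE__ regs[3] = { (__SIZE_TYPE__)key_program_struct, 0, 0 };
     unsigned int status = _pconfig_u32(__PCONFIG_KEY_PROGRAM, regs);
     // a zero status is expected to indicate success for this leaf
*/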
+3 -17
@@ -1,23 +1,9 @@
/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
@@ -1,22 +1,8 @@
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -49
@@ -1,22 +1,8 @@
/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
@@ -43,22 +29,6 @@ _mm_popcnt_u32(unsigned int __A)
return __builtin_popcount(__A);
}
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS
_popcnt32(int __A)
{
return __builtin_popcount(__A);
}
#ifdef __x86_64__
/// Counts the number of bits in the source operand having a value of 1.
///
@@ -75,22 +45,6 @@ _mm_popcnt_u64(unsigned long long __A)
{
return __builtin_popcountll(__A);
}
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS
_popcnt64(long long __A)
{
return __builtin_popcountll(__A);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
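/* Illustrative sketch of the remaining population count intrinsics above
   (compile with -mpopcnt; typically reached via <immintrin.h>):

     int a = _mm_popcnt_u32(0xF0F0u);                  // 8
     #ifdef __x86_64__
     long long b = _mm_popcnt_u64(0xFFFF0000FFFFull);  // 32
     #endif
*/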
+2318
@@ -0,0 +1,2318 @@
/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help port code that uses Intel intrinsics
explicitly from x86_64 to powerpc64/powerpc64le.
Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
the PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
However, scalar float operations in vector (XMM) registers require
the POWER8 VSX ISA (2.07) level. There are differences in the data
format and placement of float scalars in the vector register, which
require extra steps to match SSE2 scalar float semantics on POWER.
It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
registers differ substantially. It is recommended to use the portable
<fenv.h> interface instead of accessing the MXCSR directly.
Most SSE2 scalar float intrinsic operations can be performed more
efficiently as C language float scalar operations or optimized to
use vector SIMD operations. We recommend this for new applications.
*/
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
#include <altivec.h>
/* We need definitions from the SSE header files. */
#include <xmmintrin.h>
/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
/* Unaligned version of the same types. */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
/* Define two value permute mask. */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
return __extension__ (__m128d){ __F, 0.0 };
}
/* Create a vector with both elements equal to F. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
return __extension__ (__m128d){ __F, __F };
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
return _mm_set1_pd (__F);
}
/* Create a vector with the lower value X and upper value W. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
return __extension__ (__m128d){ __X, __W };
}
/* Create a vector with the lower value W and upper value X. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
return __extension__ (__m128d){ __W, __X };
}
/* Create an undefined vector. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
__m128d __Y = __Y;
return __Y;
}
/* Create a vector of zeros. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
return (__m128d) vec_splats (0);
}
/* Sets the low DPFP value of A from the low value of B. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
__v2df result = (__v2df) __A;
result [0] = ((__v2df) __B)[0];
return (__m128d) result;
}
/* Load two DPFP values from P. The address must be 16-byte aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
return ((__m128d)vec_ld(0, (__v16qu*)__P));
}
/* Load two DPFP values from P. The address need not be 16-byte aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
return (vec_vsx_ld(0, __P));
}
/* Create a vector with both elements equal to *P. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
return (vec_splats (*__P));
}
/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
return _mm_set_sd (*__P);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
return _mm_load1_pd (__P);
}
/* Load two DPFP values in reverse order. The address must be aligned. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
__v2df __tmp = _mm_load_pd (__P);
return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}
/* Store two DPFP values. The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}
/* Store two DPFP values. The address need not be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
*(__m128d_u *)__P = __A;
}
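/* Illustrative sketch: _mm_load_pd/_mm_store_pd map to vec_ld/vec_st and keep
   the SSE2 16-byte alignment requirement, while the *_u variants do not.
   Buffer names are placeholders:

     double in[2] __attribute__ ((aligned (16))) = { 1.0, 2.0 };
     double out[2];                  // need not be 16-byte aligned
     __m128d v = _mm_load_pd (in);   // aligned load
     _mm_storeu_pd (out, v);         // unaligned store
*/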
/* Stores the lower DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
*__P = ((__v2df)__A)[0];
}
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
return ((__v2df)__A)[0];
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
_mm_store_sd (__P, __A);
}
/* Stores the upper DPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
*__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
_mm_store_pd (__P, vec_splat (__A, 0));
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
_mm_store1_pd (__P, __A);
}
/* Store two DPFP values in reverse order. The address must be aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
_mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
return ((__v2di)__A)[0];
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
return ((__v2di)__A)[0];
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
return (__m128d) ((__v2df)__A + (__v2df)__B);
}
/* Add the lower double-precision (64-bit) floating-point element in
a and b, store the result in the lower element of dst, and copy
the upper element from a to the upper element of dst. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
__A[0] = __A[0] + __B[0];
return (__A);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
return (__m128d) ((__v2df)__A - (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
__A[0] = __A[0] - __B[0];
return (__A);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
return (__m128d) ((__v2df)__A * (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
__A[0] = __A[0] * __B[0];
return (__A);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
return (__m128d) ((__v2df)__A / (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
__A[0] = __A[0] / __B[0];
return (__A);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
return (vec_sqrt (__A));
}
/* Return pair {sqrt (B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
__v2df c;
c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
return (vec_min (__A, __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = vec_min (a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
return (vec_max (__A, __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = vec_max (a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
__v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
return ((__m128d)vec_nor (temp, temp));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq (__A, __A);
d = (__v2du)vec_cmpeq (__B, __B);
#else
__v2du a, b;
__v2du c, d;
const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
a = (__v2du)vec_abs ((__v2df)__A);
b = (__v2du)vec_abs ((__v2df)__B);
c = (__v2du)vec_cmpgt (double_exp_mask, a);
d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
/* A != NAN and B != NAN. */
return ((__m128d)vec_and(c, d));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
/* A == NAN OR B == NAN converts to:
NOT(A != NAN) OR NOT(B != NAN). */
c = vec_nor (c, c);
return ((__m128d)vec_orc(c, d));
#else
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
/* Invert so that true ('1's) indicates NAN. */
c = vec_nor (c, c);
d = vec_nor (d, d);
return ((__m128d)vec_or(c, d));
#endif
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B)
{
__v2df a, b, c;
/* PowerISA VSX does not allow partial (for just lower double)
results. So to ensure we don't generate spurious exceptions
(from the upper double values) we splat the lower double
before we do the operation. */
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmpeq(a, b);
/* Then we merge the lower double result with the original upper
double from __A. */
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmplt(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmple(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmpgt(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmpge(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
c = (__v2df) vec_cmpeq(a, b);
c = vec_nor (c, c);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
/* Not less than is just greater than or equal. */
c = (__v2df) vec_cmpge(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
/* Not less than or equal is just greater than. */
c = (__v2df) vec_cmpge(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
/* Not greater than is just less than or equal. */
c = (__v2df) vec_cmple(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
__v2df a, b, c;
a = vec_splats (__A[0]);
b = vec_splats (__B[0]);
/* Not greater than or equal is just less than. */
c = (__v2df) vec_cmplt(a, b);
return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
__v2df r;
r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
__v2df r;
r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
return (__m128d) _mm_setr_pd (r[0], __A[1]);
}
/* FIXME
The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
exactly the same because GCC for PowerPC only generates unordered
compares (scalar and vector).
Technically _mm_comieq_sd et al. should be using the ordered
compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
be OK. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
return (__A[0] == __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
return (__A[0] < __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
return (__A[0] <= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
return (__A[0] > __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
return (__A[0] >= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
return (__A[0] != __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
return (__A[0] == __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
return (__A[0] < __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
return (__A[0] <= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
return (__A[0] > __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
return (__A[0] >= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
return (__A[0] != __B[0]);
}
/* Create a vector of Qi, where i is the element number. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
short __q3, short __q2, short __q1, short __q0)
{
return __extension__ (__m128i)(__v8hi){
__q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
char __q11, char __q10, char __q09, char __q08,
char __q07, char __q06, char __q05, char __q04,
char __q03, char __q02, char __q01, char __q00)
{
return __extension__ (__m128i)(__v16qi){
__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
__q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
};
}
/* Set all of the elements of the vector to A. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
return _mm_set_epi64x (__A, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
return _mm_set_epi64 (__A, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
return _mm_set_epi32 (__A, __A, __A, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
__A, __A, __A, __A, __A, __A, __A, __A);
}
/* Create a vector of Qi, where i is the element number.
The parameter order is reversed from the _mm_set_epi* functions. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
return _mm_set_epi64 (__q1, __q0);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
short __q4, short __q5, short __q6, short __q7)
{
return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
char __q04, char __q05, char __q06, char __q07,
char __q08, char __q09, char __q10, char __q11,
char __q12, char __q13, char __q14, char __q15)
{
return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
__q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
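/* Illustrative sketch: _mm_set_epi32 takes its arguments most-significant
   element first, while _mm_setr_epi32 takes them in memory order, so the two
   vectors below have identical layouts:

     __m128i a = _mm_set_epi32 (3, 2, 1, 0);   // elements {0, 1, 2, 3}
     __m128i b = _mm_setr_epi32 (0, 1, 2, 3);  // same layout as a
*/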
/* Load 128 bits of integer data. The address must be 16-byte aligned. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
return *__P;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
*__P = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
*(long long *)__P = ((__v2di)__B)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
return (__m64) ((__v2di)__B)[0];
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
return _mm_set_epi64 ((__m64)0LL, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}
/* Create an undefined vector. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
__m128i __Y = __Y;
return __Y;
}
/* Create a vector of zeros. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}
#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
__v2di val;
/* For LE we need to generate Vector Unpack Low Signed Word,
which is generated from unpackh. */
val = (__v2di)vec_unpackh ((__v4si)__A);
return (__m128d)vec_ctf (val, 0);
}
#endif
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
return ((__m128)vec_ctf((__v4si)__A, 0));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
__v2df rounded = vec_rint (__A);
__v4si result, temp;
const __v4si vzero =
{ 0, 0, 0, 0 };
/* VSX Vector truncate Double-Precision to integer and Convert to
Signed Integer Word format with Saturate. */
__asm__(
"xvcvdpsxws %x0,%x1"
: "=wa" (temp)
: "wa" (rounded)
: );
#ifdef _ARCH_PWR8
temp = vec_mergeo (temp, temp);
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
{
const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
}
#endif
return (__m128i) result;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
__m128i result = _mm_cvtpd_epi32(__A);
return (__m64) result[0];
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
__v4sf result;
__v4si temp;
const __v4si vzero = { 0, 0, 0, 0 };
__asm__(
"xvcvdpsp %x0,%x1"
: "=wa" (temp)
: "wa" (__A)
: );
#ifdef _ARCH_PWR8
temp = vec_mergeo (temp, temp);
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
{
const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
}
#endif
return ((__m128)result);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
__v4si result;
__v4si temp;
const __v4si vzero = { 0, 0, 0, 0 };
/* VSX Vector truncate Double-Precision to integer and Convert to
Signed Integer Word format with Saturate. */
__asm__(
"xvcvdpsxws %x0,%x1"
: "=wa" (temp)
: "wa" (__A)
: );
#ifdef _ARCH_PWR8
temp = vec_mergeo (temp, temp);
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
{
const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
}
#endif
return ((__m128i) result);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
__m128i result = _mm_cvttpd_epi32 (__A);
return (__m64) result[0];
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
return ((__v4si)__A)[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
__v4si temp;
__v2di tmp2;
__v2df result;
temp = (__v4si)vec_splats (__A);
tmp2 = (__v2di)vec_unpackl (temp);
result = vec_ctf ((__vector signed long long) tmp2, 0);
return (__m128d)result;
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
__v4sf rounded;
__v4si result;
rounded = vec_rint((__v4sf) __A);
result = vec_cts (rounded, 0);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
__v4si result;
result = vec_cts ((__v4sf) __A, 0);
return (__m128i) result;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
/* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
#ifdef vec_doubleh
return (__m128d) vec_doubleh ((__v4sf)__A);
#else
/* Otherwise the compiler is not current and so we need to generate the
equivalent code. */
__v4sf a = (__v4sf)__A;
__v4sf temp;
__v2df result;
#ifdef __LITTLE_ENDIAN__
/* The input float values are in elements {[0], [1]} but the convert
instruction needs them in elements {[1], [3]}, so we use two
shift left double vector word immediates to get the elements
lined up. */
temp = __builtin_vsx_xxsldwi (a, a, 3);
temp = __builtin_vsx_xxsldwi (a, temp, 2);
#else
/* The input float values are in elements {[0], [1]} but the convert
instruction needs them in elements {[0], [2]}, so we use two
shift left double vector word immediates to get the elements
lined up. */
temp = vec_vmrghw (a, a);
#endif
__asm__(
" xvcvspdp %x0,%x1"
: "=wa" (result)
: "wa" (temp)
: );
return (__m128d) result;
#endif
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
__v2df rounded = vec_rint((__v2df) __A);
int result = ((__v2df)rounded)[0];
return result;
}
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
__v2df rounded = vec_rint ((__v2df) __A );
long long result = ((__v2df) rounded)[0];
return result;
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
return _mm_cvtsd_si64 ((__v2df)__A);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
int result = ((__v2df)__A)[0];
return result;
}
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
long long result = ((__v2df)__A)[0];
return result;
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
return _mm_cvttsd_si64 (__A);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
__v4sf result = (__v4sf)__A;
#ifdef __LITTLE_ENDIAN__
__v4sf temp_s;
/* Copy double element[0] to element [1] for conversion. */
__v2df temp_b = vec_splat((__v2df)__B, 0);
/* Pre-rotate __A left 3 (logically right 1) elements. */
result = __builtin_vsx_xxsldwi (result, result, 3);
/* Convert double to single float scalar in a vector. */
__asm__(
"xscvdpsp %x0,%x1"
: "=wa" (temp_s)
: "wa" (temp_b)
: );
/* Shift the resulting scalar into vector element [0]. */
result = __builtin_vsx_xxsldwi (result, temp_s, 1);
#else
result [0] = ((__v2df)__B)[0];
#endif
return (__m128) result;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
__v2df result = (__v2df)__A;
double db = __B;
result [0] = db;
return (__m128d)result;
}
/* Intel intrinsic. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
__v2df result = (__v2df)__A;
double db = __B;
result [0] = db;
return (__m128d)result;
}
/* Microsoft intrinsic. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
return _mm_cvtsi64_sd (__A, __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
/* Use splat to move element [0] into position for the convert. */
__v4sf temp = vec_splat ((__v4sf)__B, 0);
__v2df res;
/* Convert single float scalar to double in a vector. */
__asm__(
"xscvspdp %x0,%x1"
: "=wa" (res)
: "wa" (temp)
: );
return (__m128d) vec_mergel (res, (__v2df)__A);
#else
__v2df res = (__v2df)__A;
res [0] = ((__v4sf)__B) [0];
return (__m128d) res;
#endif
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
__vector double result;
const int litmsk = __mask & 0x3;
if (litmsk == 0)
result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
else if (litmsk == 1)
result = vec_xxpermdi (__B, __A, 2);
else if (litmsk == 2)
result = vec_xxpermdi (__B, __A, 1);
#else
else if (litmsk == 1)
result = vec_xxpermdi (__A, __B, 2);
else if (litmsk == 2)
result = vec_xxpermdi (__A, __B, 1);
#endif
else
result = vec_mergel (__A, __B);
return result;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
__v2df result = (__v2df)__A;
result [1] = *__B;
return (__m128d)result;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
__v2df result = (__v2df)__A;
result [0] = *__B;
return (__m128d)result;
}
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
/* Creates a 2-bit mask from the most significant bits of the DPFP values. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
#ifdef __LITTLE_ENDIAN__
0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
};
result = ((__vector unsigned long long)
vec_vbpermq ((__vector unsigned char) __A,
(__vector unsigned char) perm_mask));
#ifdef __LITTLE_ENDIAN__
return result[1];
#else
return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergel ((__vector long long) __A,
(__vector long long) __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
return (__m128i) vec_mergeh ((__vector long long) __A,
(__vector long long) __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
__vector signed int zero = {0, 0, 0, 0};
return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
__vector signed int w0, w1;
__vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
};
w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
return (__m128i) vec_perm (w0, w1, xform1);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
unsigned int a = __A;
unsigned int b = __B;
return ((__m64)a * (__m64)b);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8
__v2du result;
#ifdef __LITTLE_ENDIAN__
/* VMX Vector Multiply Odd Unsigned Word. */
__asm__(
"vmulouw %0,%1,%2"
: "=v" (result)
: "v" (__A), "v" (__B)
: );
#else
/* VMX Vector Multiply Even Unsigned Word. */
__asm__(
"vmuleuw %0,%1,%2"
: "=v" (result)
: "v" (__A), "v" (__B)
: );
#endif
return (__m128i) result;
#else
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
__v8hu lshift;
__v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
if (__B >= 0 && __B < 16)
{
if (__builtin_constant_p(__B))
lshift = (__v8hu) vec_splat_s16(__B);
else
lshift = vec_splats ((unsigned short) __B);
result = vec_sl ((__v8hi) __A, lshift);
}
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
__v4su lshift;
__v4si result = { 0, 0, 0, 0 };
if (__B >= 0 && __B < 32)
{
if (__builtin_constant_p(__B) && __B < 16)
lshift = (__v4su) vec_splat_s32(__B);
else
lshift = vec_splats ((unsigned int) __B);
result = vec_sl ((__v4si) __A, lshift);
}
return (__m128i) result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
__v2du lshift;
__v2di result = { 0, 0 };
if (__B >= 0 && __B < 64)
{
if (__builtin_constant_p(__B) && __B < 16)
lshift = (__v2du) vec_splat_s32(__B);
else
lshift = (__v2du) vec_splats ((unsigned int) __B);
result = vec_sl ((__v2di) __A, lshift);
}
return (__m128i) result;
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
__v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
__v8hi result;
if (__B < 16)
{
if (__builtin_constant_p(__B))
rshift = (__v8hu) vec_splat_s16(__B);
else
rshift = vec_splats ((unsigned short) __B);
}
result = vec_sra ((__v8hi) __A, rshift);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
__v4su rshift = { 31, 31, 31, 31 };
__v4si result;
if (__B < 32)
{
if (__builtin_constant_p(__B))
{
if (__B < 16)
rshift = (__v4su) vec_splat_s32(__B);
else
rshift = (__v4su) vec_splats((unsigned int)__B);
}
else
rshift = vec_splats ((unsigned int) __B);
}
result = vec_sra ((__v4si) __A, rshift);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
__v16qu result;
const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
if (__N < 16)
result = vec_sld ((__v16qu) __A, zeros, __N);
else
result = zeros;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
__v16qu result;
const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
if (__N < 16)
#ifdef __LITTLE_ENDIAN__
if (__builtin_constant_p(__N))
/* We would like to use the Vector Shift Left Double by Octet
Immediate here to get the immediate form and avoid loading the
__N * 8 value into a separate VR. */
result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
else
#endif
{
__v16qu shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
result = vec_sro ((__v16qu)__A, shift);
#else
result = vec_slo ((__v16qu)__A, shift);
#endif
}
else
result = zeros;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
return _mm_bsrli_si128 (__A, __N);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
__v16qu result;
const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
result = vec_sld ((__v16qu) __A, zeros, _imm5);
#else
result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
#endif
else
result = zeros;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
__v8hu rshift;
__v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
if (__B < 16)
{
if (__builtin_constant_p(__B))
rshift = (__v8hu) vec_splat_s16(__B);
else
rshift = vec_splats ((unsigned short) __B);
result = vec_sr ((__v8hi) __A, rshift);
}
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
__v4su rshift;
__v4si result = { 0, 0, 0, 0 };
if (__B < 32)
{
if (__builtin_constant_p(__B))
{
if (__B < 16)
rshift = (__v4su) vec_splat_s32(__B);
else
rshift = (__v4su) vec_splats((unsigned int)__B);
}
else
rshift = vec_splats ((unsigned int) __B);
result = vec_sr ((__v4si) __A, rshift);
}
return (__m128i) result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
__v2du rshift;
__v2di result = { 0, 0 };
if (__B < 64)
{
if (__builtin_constant_p(__B))
{
if (__B < 16)
rshift = (__v2du) vec_splat_s32(__B);
else
rshift = (__v2du) vec_splats((unsigned long long)__B);
}
else
rshift = (__v2du) vec_splats ((unsigned int) __B);
result = vec_sr ((__v2di) __A, rshift);
}
return (__m128i) result;
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
__v8hu lshift;
__vector __bool short shmask;
const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
__v8hu result;
#ifdef __LITTLE_ENDIAN__
lshift = vec_splat ((__v8hu) __B, 0);
#else
lshift = vec_splat ((__v8hu) __B, 3);
#endif
shmask = vec_cmple (lshift, shmax);
result = vec_sl ((__v8hu) __A, lshift);
result = vec_sel ((__v8hu) shmask, result, shmask);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
__v4su lshift;
__vector __bool int shmask;
const __v4su shmax = { 32, 32, 32, 32 };
__v4su result;
#ifdef __LITTLE_ENDIAN__
lshift = vec_splat ((__v4su) __B, 0);
#else
lshift = vec_splat ((__v4su) __B, 1);
#endif
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v4su) __A, lshift);
result = vec_sel ((__v4su) shmask, result, shmask);
return (__m128i) result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
__v2du lshift;
__vector __bool long long shmask;
const __v2du shmax = { 64, 64 };
__v2du result;
lshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v2du) __A, lshift);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
__v8hu rshift;
__v8hi result;
#ifdef __LITTLE_ENDIAN__
rshift = vec_splat ((__v8hu)__B, 0);
#else
rshift = vec_splat ((__v8hu)__B, 3);
#endif
rshift = vec_min (rshift, rshmax);
result = vec_sra ((__v8hi) __A, rshift);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
const __v4su rshmax = { 31, 31, 31, 31 };
__v4su rshift;
__v4si result;
#ifdef __LITTLE_ENDIAN__
rshift = vec_splat ((__v4su)__B, 0);
#else
rshift = vec_splat ((__v4su)__B, 1);
#endif
rshift = vec_min (rshift, rshmax);
result = vec_sra ((__v4si) __A, rshift);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
__v8hu rshift;
__vector __bool short shmask;
const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
__v8hu result;
#ifdef __LITTLE_ENDIAN__
rshift = vec_splat ((__v8hu) __B, 0);
#else
rshift = vec_splat ((__v8hu) __B, 3);
#endif
shmask = vec_cmple (rshift, shmax);
result = vec_sr ((__v8hu) __A, rshift);
result = vec_sel ((__v8hu) shmask, result, shmask);
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
__v4su rshift;
__vector __bool int shmask;
const __v4su shmax = { 32, 32, 32, 32 };
__v4su result;
#ifdef __LITTLE_ENDIAN__
rshift = vec_splat ((__v4su) __B, 0);
#else
rshift = vec_splat ((__v4su) __B, 1);
#endif
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v4su) __A, rshift);
result = vec_sel ((__v4su) shmask, result, shmask);
return (__m128i) result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
__v2du rshift;
__vector __bool long long shmask;
const __v2du shmax = { 64, 64 };
__v2du result;
rshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v2du) __A, rshift);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
return (vec_and ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
return (vec_andc ((__v2df) __B, (__v2df) __A));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
return (vec_or ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
return (vec_xor ((__v2df) __A, (__v2df) __B));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
return (unsigned short) ((__v8hi)__A)[__N & 7];
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
__v8hi result = (__v8hi)__A;
result [(__N & 7)] = __D;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
/* Creates a 16-bit mask from the most significant bit of each byte element in A. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
};
result = ((__vector unsigned long long)
vec_vbpermq ((__vector unsigned char) __A,
(__vector unsigned char) perm_mask));
#ifdef __LITTLE_ENDIAN__
return result[1];
#else
return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
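/* Illustrative sketch (not part of the original header), assuming the
   usual SSE2 semantics: the mask gathers the most significant bit of
   each of the 16 byte elements, bit i coming from byte i.
     __m128i m = _mm_set_epi8 (-1, 0, 0, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, -1);
     int bits = _mm_movemask_epi8 (m);
     // Bytes 0 and 15 have their sign bit set, so bits == 0x8001
     // (only available under _ARCH_PWR8 in this implementation).  */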
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
__v4su w0, w1;
__v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
};
w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
return (__m128i) vec_perm (w0, w1, xform1);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
unsigned long element_selector_98 = __mask & 0x03;
unsigned long element_selector_BA = (__mask >> 2) & 0x03;
unsigned long element_selector_DC = (__mask >> 4) & 0x03;
unsigned long element_selector_FE = (__mask >> 6) & 0x03;
static const unsigned short permute_selectors[4] =
{
#ifdef __LITTLE_ENDIAN__
0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
};
__v2du pmask =
#ifdef __LITTLE_ENDIAN__
{ 0x1716151413121110UL, 0UL};
#else
{ 0x1011121314151617UL, 0UL};
#endif
__m64_union t;
__v2du a, r;
t.as_short[0] = permute_selectors[element_selector_98];
t.as_short[1] = permute_selectors[element_selector_BA];
t.as_short[2] = permute_selectors[element_selector_DC];
t.as_short[3] = permute_selectors[element_selector_FE];
pmask[1] = t.as_m64;
a = (__v2du)__A;
r = vec_perm (a, a, (__vector unsigned char)pmask);
return (__m128i) r;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
unsigned long element_selector_10 = __mask & 0x03;
unsigned long element_selector_32 = (__mask >> 2) & 0x03;
unsigned long element_selector_54 = (__mask >> 4) & 0x03;
unsigned long element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned short permute_selectors[4] =
{
#ifdef __LITTLE_ENDIAN__
0x0100, 0x0302, 0x0504, 0x0706
#else
0x0001, 0x0203, 0x0405, 0x0607
#endif
};
__v2du pmask =
#ifdef __LITTLE_ENDIAN__
{ 0UL, 0x1f1e1d1c1b1a1918UL};
#else
{ 0UL, 0x18191a1b1c1d1e1fUL};
#endif
__m64_union t;
__v2du a, r;
t.as_short[0] = permute_selectors[element_selector_10];
t.as_short[1] = permute_selectors[element_selector_32];
t.as_short[2] = permute_selectors[element_selector_54];
t.as_short[3] = permute_selectors[element_selector_76];
pmask[0] = t.as_m64;
a = (__v2du)__A;
r = vec_perm (a, a, (__vector unsigned char)pmask);
return (__m128i) r;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
unsigned long element_selector_10 = __mask & 0x03;
unsigned long element_selector_32 = (__mask >> 2) & 0x03;
unsigned long element_selector_54 = (__mask >> 4) & 0x03;
unsigned long element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned int permute_selectors[4] =
{
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__v4su t;
t[0] = permute_selectors[element_selector_10];
t[1] = permute_selectors[element_selector_32];
t[2] = permute_selectors[element_selector_54] + 0x10101010;
t[3] = permute_selectors[element_selector_76] + 0x10101010;
return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
__v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
__v16qu mask, tmp;
__m128i_u *p = (__m128i_u*)__C;
tmp = (__v16qu)_mm_loadu_si128(p);
mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
tmp = vec_sel (tmp, (__v16qu)__A, mask);
_mm_storeu_si128 (p, (__m128i)tmp);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
__v16qu a, b;
__v16qu vmin, vmax, vabsdiff;
__v4si vsum;
const __v4su zero = { 0, 0, 0, 0 };
__v4si result;
a = (__v16qu) __A;
b = (__v16qu) __B;
vmin = vec_min (a, b);
vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
/* Sum across four integers with two integer results. */
result = vec_sum2s (vsum, (__vector signed int) zero);
/* Rotate the sums into the correct position. */
#ifdef __LITTLE_ENDIAN__
result = vec_sld (result, result, 4);
#else
result = vec_sld (result, result, 6);
#endif
return (__m128i) result;
}
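/* Illustrative sketch (not part of the original header), assuming the
   usual SSE2 semantics: _mm_sad_epu8 produces two 16-bit sums of
   absolute byte differences, one per 8-byte half, in bits [15:0] and
   [79:64] of the result.
     __m128i a = _mm_set1_epi8 (5);
     __m128i b = _mm_set1_epi8 (2);
     __m128i s = _mm_sad_epu8 (a, b);
     // Each half sums eight |5 - 2| values, so
     // _mm_extract_epi16 (s, 0) == 24 and _mm_extract_epi16 (s, 4) == 24.  */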
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
/* Use the data cache block touch for store transient. */
__asm__ (
"dcbtstt 0,%0"
:
: "b" (__A)
: "memory"
);
*__A = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
/* Use the data cache block touch for store transient. */
__asm__ (
" dcbtstt 0,%0"
:
: "b" (__A)
: "memory"
);
*__A = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
/* Use the data cache block touch for store transient. */
__asm__ (
"dcbtstt 0,%0"
:
: "b" (__A)
: "memory"
);
*__A = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
/* Use the data cache block touch for store transient. */
__asm__ (
"dcbtstt 0,%0"
:
: "b" (__A)
: "memory"
);
*(__m128d*)__A = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
/* Use the data cache block flush. */
__asm__ (
"dcbf 0,%0"
:
: "b" (__A)
: "memory"
);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
/* Use light weight sync for load to load ordering. */
__atomic_thread_fence (__ATOMIC_RELEASE);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
/* Use heavy weight sync for any to any ordering. */
__atomic_thread_fence (__ATOMIC_SEQ_CST);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
return _mm_set_epi32 (0, 0, 0, __A);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
return __extension__ (__m128i)(__v2di){ __A, 0LL };
}
/* Microsoft intrinsic. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
return __extension__ (__m128i)(__v2di){ __A, 0LL };
}
/* Casts between various SP, DP, INT vector types. Note that these do no
conversion of values, they just change the type. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
return (__m128) __A;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
return (__m128i) __A;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
return (__m128d) __A;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
return (__m128i) __A;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
return (__m128) __A;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
return (__m128d) __A;
}
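/* Illustrative sketch (not part of the original header): the casts above
   only reinterpret the 128-bit pattern, so a round trip preserves bits.
     __m128i k = _mm_set1_epi32 (-1);
     __m128  f = _mm_castsi128_ps (k);  // same bits viewed as four floats
     __m128i r = _mm_castps_si128 (f);  // bit pattern identical to k  */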
#endif /* EMMINTRIN_H_ */
+44
View File
@@ -0,0 +1,44 @@
/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#include <stdlib.h>
/* We can't depend on <stdlib.h> to provide the prototype of posix_memalign,
since it may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
#endif
static __inline void *
_mm_malloc (size_t size, size_t alignment)
{
/* PowerPC64 ELF V2 ABI requires quadword alignment. */
size_t vec_align = sizeof (__vector float);
void *ptr;
if (alignment < vec_align)
alignment = vec_align;
if (posix_memalign (&ptr, alignment, size) == 0)
return ptr;
else
return NULL;
}
static __inline void
_mm_free (void * ptr)
{
free (ptr);
}
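/* Illustrative sketch (not part of the original header): requested
   alignments below the vector size are raised to 16 bytes by _mm_malloc
   above, so the returned storage is always suitable for __vector loads
   and stores.
     float *buf = (float *) _mm_malloc (64 * sizeof (float), 8);
     // If the allocation succeeds, buf is 16-byte aligned even though
     // only 8 was requested.
     _mm_free (buf);  */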
#endif /* _MM_MALLOC_H_INCLUDED */
+1443
View File
@@ -0,0 +1,1443 @@
/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
explicitly from x86_64 to powerpc64/powerpc64le.
Since PowerPC target doesn't support native 64-bit vector type, we
typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
works well for _si64 and some _pi32 operations.
For _pi16 and _pi8 operations, it's better to transfer __m64 into
128-bit PowerPC vector first. Power8 introduced direct register
move instructions which helps for more efficient implementation.
It's the user's responsibility to determine whether the results of
such a port are acceptable or further changes are needed. Please
note that much code using Intel intrinsics CAN BE REWRITTEN in more
portable and efficient standard C or GNU C extensions with 64-bit
scalar operations, or 128-bit SSE/Altivec operations, which is the
recommended approach. */
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;
typedef __attribute__((__aligned__(8))) union {
__m64 as_m64;
char as_char[8];
signed char as_signed_char[8];
short as_short[4];
int as_int[2];
long long as_long_long;
float as_float[2];
double as_double;
} __m64_union;
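/* Illustrative sketch (not part of the original header): because __m64 is
   a plain 64-bit scalar here, __m64_union gives lane access without casts.
     __m64_union u;
     u.as_m64 = 0;
     u.as_short[2] = 7;   // bytes 4-5 on little-endian
     __m64 v = u.as_m64;  // v == 0x0000000700000000ULL on little-endian  */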
/* Empty the multimedia state. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty(void) {
/* nothing to do on PowerPC. */
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty(void) {
/* nothing to do on PowerPC. */
}
/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64(int __i) {
return (__m64)(unsigned int)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int(int __i) {
return _mm_cvtsi32_si64(__i);
}
/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32(__m64 __i) {
return ((int)__i);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int(__m64 __i) {
return _mm_cvtsi64_si32(__i);
}
/* Convert I to a __m64 object. */
/* Intel intrinsic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64(long long __i) {
return (__m64)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64(long long __i) {
return (__m64)__i;
}
/* Microsoft intrinsic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64(long long __i) {
return (__m64)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x(long long __i) {
return (__m64)__i;
}
/* Convert the __m64 object to a 64-bit integer. */
/* Intel intrinsic. */
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64(__m64 __i) {
return (long long)__i;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64(__m64 __i) {
return (long long)__i;
}
/* Microsoft intrinsic. */
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x(__m64 __i) {
return (long long)__i;
}
#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
the result, and the four 16-bit values from M2 into the upper four 8-bit
values of the result, all with signed saturation. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16(__m64 __m1, __m64 __m2) {
__vector signed short vm1;
__vector signed char vresult;
vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
{__m1, __m2};
#else
{__m2, __m1};
#endif
vresult = vec_packs(vm1, vm1);
return (__m64)((__vector long long)vresult)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb(__m64 __m1, __m64 __m2) {
return _mm_packs_pi16(__m1, __m2);
}
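/* Illustrative sketch (not part of the original header), assuming the
   usual MMX semantics: each 16-bit source lane is saturated to the
   signed 8-bit range before packing.
     __m64 a = _mm_set_pi16 (300, -300, 5, -5);
     __m64 r = _mm_packs_pi16 (a, a);
     // 300 saturates to 127 and -300 to -128; 5 and -5 are unchanged,
     // and the same four bytes repeat for the second operand.  */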
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
the result, and the two 32-bit values from M2 into the upper two 16-bit
values of the result, all with signed saturation. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32(__m64 __m1, __m64 __m2) {
__vector signed int vm1;
__vector signed short vresult;
vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
{__m1, __m2};
#else
{__m2, __m1};
#endif
vresult = vec_packs(vm1, vm1);
return (__m64)((__vector long long)vresult)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw(__m64 __m1, __m64 __m2) {
return _mm_packs_pi32(__m1, __m2);
}
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
the result, and the four 16-bit values from M2 into the upper four 8-bit
values of the result, all with unsigned saturation. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16(__m64 __m1, __m64 __m2) {
__vector unsigned char r;
__vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
{__m1, __m2};
#else
{__m2, __m1};
#endif
const __vector signed short __zero = {0};
__vector __bool short __select = vec_cmplt(vm1, __zero);
r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
__vector __bool char packsel = vec_pack(__select, __select);
r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
return (__m64)((__vector long long)r)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb(__m64 __m1, __m64 __m2) {
return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
/* Interleave the four 8-bit values from the high half of M1 with the four
8-bit values from the high half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector unsigned char a, b, c;
a = (__vector unsigned char)vec_splats(__m1);
b = (__vector unsigned char)vec_splats(__m2);
c = vec_mergel(a, b);
return (__m64)((__vector long long)c)[1];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = m1.as_char[4];
res.as_char[1] = m2.as_char[4];
res.as_char[2] = m1.as_char[5];
res.as_char[3] = m2.as_char[5];
res.as_char[4] = m1.as_char[6];
res.as_char[5] = m2.as_char[6];
res.as_char[6] = m1.as_char[7];
res.as_char[7] = m2.as_char[7];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the high half of M1 with the two
16-bit values from the high half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = m1.as_short[2];
res.as_short[1] = m2.as_short[2];
res.as_short[2] = m1.as_short[3];
res.as_short[3] = m2.as_short[3];
return (__m64)res.as_m64;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
value from the high half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = m1.as_int[1];
res.as_int[1] = m2.as_int[1];
return (__m64)res.as_m64;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
8-bit values from the low half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector unsigned char a, b, c;
a = (__vector unsigned char)vec_splats(__m1);
b = (__vector unsigned char)vec_splats(__m2);
c = vec_mergel(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = m1.as_char[0];
res.as_char[1] = m2.as_char[0];
res.as_char[2] = m1.as_char[1];
res.as_char[3] = m2.as_char[1];
res.as_char[4] = m1.as_char[2];
res.as_char[5] = m2.as_char[2];
res.as_char[6] = m1.as_char[3];
res.as_char[7] = m2.as_char[3];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
16-bit values from the low half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = m1.as_short[0];
res.as_short[1] = m2.as_short[0];
res.as_short[2] = m1.as_short[1];
res.as_short[3] = m2.as_short[1];
return (__m64)res.as_m64;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
value from the low half of M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = m1.as_int[0];
res.as_int[1] = m2.as_int[0];
return (__m64)res.as_m64;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi32(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed char a, b, c;
a = (__vector signed char)vec_splats(__m1);
b = (__vector signed char)vec_splats(__m2);
c = vec_add(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = m1.as_char[0] + m2.as_char[0];
res.as_char[1] = m1.as_char[1] + m2.as_char[1];
res.as_char[2] = m1.as_char[2] + m2.as_char[2];
res.as_char[3] = m1.as_char[3] + m2.as_char[3];
res.as_char[4] = m1.as_char[4] + m2.as_char[4];
res.as_char[5] = m1.as_char[5] + m2.as_char[5];
res.as_char[6] = m1.as_char[6] + m2.as_char[6];
res.as_char[7] = m1.as_char[7] + m2.as_char[7];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb(__m64 __m1, __m64 __m2) {
return _mm_add_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = vec_add(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = m1.as_short[0] + m2.as_short[0];
res.as_short[1] = m1.as_short[1] + m2.as_short[1];
res.as_short[2] = m1.as_short[2] + m2.as_short[2];
res.as_short[3] = m1.as_short[3] + m2.as_short[3];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw(__m64 __m1, __m64 __m2) {
return _mm_add_pi16(__m1, __m2);
}
/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
__vector signed int a, b, c;
a = (__vector signed int)vec_splats(__m1);
b = (__vector signed int)vec_splats(__m2);
c = vec_add(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = m1.as_int[0] + m2.as_int[0];
res.as_int[1] = m1.as_int[1] + m2.as_int[1];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd(__m64 __m1, __m64 __m2) {
return _mm_add_pi32(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed char a, b, c;
a = (__vector signed char)vec_splats(__m1);
b = (__vector signed char)vec_splats(__m2);
c = vec_sub(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = m1.as_char[0] - m2.as_char[0];
res.as_char[1] = m1.as_char[1] - m2.as_char[1];
res.as_char[2] = m1.as_char[2] - m2.as_char[2];
res.as_char[3] = m1.as_char[3] - m2.as_char[3];
res.as_char[4] = m1.as_char[4] - m2.as_char[4];
res.as_char[5] = m1.as_char[5] - m2.as_char[5];
res.as_char[6] = m1.as_char[6] - m2.as_char[6];
res.as_char[7] = m1.as_char[7] - m2.as_char[7];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb(__m64 __m1, __m64 __m2) {
return _mm_sub_pi8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = vec_sub(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = m1.as_short[0] - m2.as_short[0];
res.as_short[1] = m1.as_short[1] - m2.as_short[1];
res.as_short[2] = m1.as_short[2] - m2.as_short[2];
res.as_short[3] = m1.as_short[3] - m2.as_short[3];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw(__m64 __m1, __m64 __m2) {
return _mm_sub_pi16(__m1, __m2);
}
/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
__vector signed int a, b, c;
a = (__vector signed int)vec_splats(__m1);
b = (__vector signed int)vec_splats(__m2);
c = vec_sub(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = m1.as_int[0] - m2.as_int[0];
res.as_int[1] = m1.as_int[1] - m2.as_int[1];
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd(__m64 __m1, __m64 __m2) {
return _mm_sub_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64(__m64 __m1, __m64 __m2) {
return (__m1 + __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64(__m64 __m1, __m64 __m2) {
return (__m1 - __m2);
}
/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64(__m64 __m, __m64 __count) {
return (__m << __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq(__m64 __m, __m64 __count) {
return _mm_sll_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64(__m64 __m, const int __count) {
return (__m << __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi(__m64 __m, const int __count) {
return _mm_slli_si64(__m, __count);
}
/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64(__m64 __m, __m64 __count) {
return (__m >> __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq(__m64 __m, __m64 __count) {
return _mm_srl_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64(__m64 __m, const int __count) {
return (__m >> __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi(__m64 __m, const int __count) {
return _mm_srli_si64(__m, __count);
}
/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64(__m64 __m1, __m64 __m2) {
return (__m1 & __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand(__m64 __m1, __m64 __m2) {
return _mm_and_si64(__m1, __m2);
}
/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
64-bit value in M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64(__m64 __m1, __m64 __m2) {
return (~__m1 & __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn(__m64 __m1, __m64 __m2) {
return _mm_andnot_si64(__m1, __m2);
}
/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64(__m64 __m1, __m64 __m2) {
return (__m1 | __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por(__m64 __m1, __m64 __m2) {
return _mm_or_si64(__m1, __m2);
}
/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64(__m64 __m1, __m64 __m2) {
return (__m1 ^ __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor(__m64 __m1, __m64 __m2) {
return _mm_xor_si64(__m1, __m2);
}
/* Creates a 64-bit zero. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64(void) {
return (__m64)0;
}
/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
test is true and zero if false. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
__m64 res;
__asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
return (res);
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed char a, b, c;
a = (__vector signed char)vec_splats(__m1);
b = (__vector signed char)vec_splats(__m2);
c = (__vector signed char)vec_cmpgt(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi8(__m1, __m2);
}
/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
the test is true and zero if false. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = (__vector signed short)vec_cmpeq(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = (__vector signed short)vec_cmpgt(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi16(__m1, __m2);
}
/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
the test is true and zero if false. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
__vector signed int a, b, c;
a = (__vector signed int)vec_splats(__m1);
b = (__vector signed int)vec_splats(__m2);
c = (__vector signed int)vec_cmpeq(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
__vector signed int a, b, c;
a = (__vector signed int)vec_splats(__m1);
b = (__vector signed int)vec_splats(__m2);
c = (__vector signed int)vec_cmpgt(a, b);
return (__m64)((__vector long long)c)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __m1;
m2.as_m64 = __m2;
res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
return (__m64)res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi32(__m1, __m2);
}
#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
saturated arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8(__m64 __m1, __m64 __m2) {
__vector signed char a, b, c;
a = (__vector signed char)vec_splats(__m1);
b = (__vector signed char)vec_splats(__m2);
c = vec_adds(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb(__m64 __m1, __m64 __m2) {
return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
saturated arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16(__m64 __m1, __m64 __m2) {
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = vec_adds(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw(__m64 __m1, __m64 __m2) {
return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
saturated arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8(__m64 __m1, __m64 __m2) {
__vector unsigned char a, b, c;
a = (__vector unsigned char)vec_splats(__m1);
b = (__vector unsigned char)vec_splats(__m2);
c = vec_adds(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb(__m64 __m1, __m64 __m2) {
return _mm_adds_pu8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
saturated arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16(__m64 __m1, __m64 __m2) {
__vector unsigned short a, b, c;
a = (__vector unsigned short)vec_splats(__m1);
b = (__vector unsigned short)vec_splats(__m2);
c = vec_adds(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw(__m64 __m1, __m64 __m2) {
return _mm_adds_pu16(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
saturating arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8(__m64 __m1, __m64 __m2) {
__vector signed char a, b, c;
a = (__vector signed char)vec_splats(__m1);
b = (__vector signed char)vec_splats(__m2);
c = vec_subs(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb(__m64 __m1, __m64 __m2) {
return _mm_subs_pi8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
signed saturating arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16(__m64 __m1, __m64 __m2) {
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = vec_subs(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw(__m64 __m1, __m64 __m2) {
return _mm_subs_pi16(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
unsigned saturating arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8(__m64 __m1, __m64 __m2) {
__vector unsigned char a, b, c;
a = (__vector unsigned char)vec_splats(__m1);
b = (__vector unsigned char)vec_splats(__m2);
c = vec_subs(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb(__m64 __m1, __m64 __m2) {
return _mm_subs_pu8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
unsigned saturating arithmetic. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16(__m64 __m1, __m64 __m2) {
__vector unsigned short a, b, c;
a = (__vector unsigned short)vec_splats(__m1);
b = (__vector unsigned short)vec_splats(__m2);
c = vec_subs(a, b);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw(__m64 __m1, __m64 __m2) {
return _mm_subs_pu16(__m1, __m2);
}
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
four 32-bit intermediate results, which are then summed by pairs to
produce two 32-bit results. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16(__m64 __m1, __m64 __m2) {
__vector signed short a, b;
__vector signed int c;
__vector signed int zero = {0, 0, 0, 0};
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = vec_vmsumshm(a, b, zero);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd(__m64 __m1, __m64 __m2) {
return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
__vector signed short a, b;
__vector signed short c;
__vector signed int w0, w1;
__vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
};
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
w0 = vec_vmulesh(a, b);
w1 = vec_vmulosh(a, b);
c = (__vector signed short)vec_perm(w0, w1, xform1);
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw(__m64 __m1, __m64 __m2) {
return _mm_mulhi_pi16(__m1, __m2);
}
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
the low 16 bits of the results. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16(__m64 __m1, __m64 __m2) {
__vector signed short a, b, c;
a = (__vector signed short)vec_splats(__m1);
b = (__vector signed short)vec_splats(__m2);
c = a * b;
return (__m64)((__vector long long)c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw(__m64 __m1, __m64 __m2) {
return _mm_mullo_pi16(__m1, __m2);
}
/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16(__m64 __m, __m64 __count) {
__vector signed short m, r;
__vector unsigned short c;
if (__count <= 15) {
m = (__vector signed short)vec_splats(__m);
c = (__vector unsigned short)vec_splats((unsigned short)__count);
r = vec_sl(m, (__vector unsigned short)c);
return (__m64)((__vector long long)r)[0];
} else
return (0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw(__m64 __m, __m64 __count) {
return _mm_sll_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16(__m64 __m, int __count) {
/* Promote int to long then invoke mm_sll_pi16. */
return _mm_sll_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi(__m64 __m, int __count) {
return _mm_slli_pi16(__m, __count);
}
/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32(__m64 __m, __m64 __count) {
__m64_union m, res;
m.as_m64 = __m;
res.as_int[0] = m.as_int[0] << __count;
res.as_int[1] = m.as_int[1] << __count;
return (res.as_m64);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld(__m64 __m, __m64 __count) {
return _mm_sll_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32(__m64 __m, int __count) {
/* Promote int to long then invoke mm_sll_pi32. */
return _mm_sll_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi(__m64 __m, int __count) {
return _mm_slli_pi32(__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16(__m64 __m, __m64 __count) {
__vector signed short m, r;
__vector unsigned short c;
if (__count <= 15) {
m = (__vector signed short)vec_splats(__m);
c = (__vector unsigned short)vec_splats((unsigned short)__count);
r = vec_sra(m, (__vector unsigned short)c);
return (__m64)((__vector long long)r)[0];
} else
return (0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw(__m64 __m, __m64 __count) {
return _mm_sra_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16(__m64 __m, int __count) {
/* Promote int to long then invoke _mm_sra_pi16. */
return _mm_sra_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi(__m64 __m, int __count) {
return _mm_srai_pi16(__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32(__m64 __m, __m64 __count) {
__m64_union m, res;
m.as_m64 = __m;
res.as_int[0] = m.as_int[0] >> __count;
res.as_int[1] = m.as_int[1] >> __count;
return (res.as_m64);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad(__m64 __m, __m64 __count) {
return _mm_sra_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32(__m64 __m, int __count) {
/* Promote int to long then invoke mm_sra_pi32. */
return _mm_sra_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi(__m64 __m, int __count) {
return _mm_srai_pi32(__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16(__m64 __m, __m64 __count) {
__vector unsigned short m, r;
__vector unsigned short c;
if (__count <= 15) {
m = (__vector unsigned short)vec_splats(__m);
c = (__vector unsigned short)vec_splats((unsigned short)__count);
r = vec_sr(m, (__vector unsigned short)c);
return (__m64)((__vector long long)r)[0];
} else
return (0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw(__m64 __m, __m64 __count) {
return _mm_srl_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16(__m64 __m, int __count) {
/* Promote int to long then invoke _mm_srl_pi16. */
return _mm_srl_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi(__m64 __m, int __count) {
return _mm_srli_pi16(__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32(__m64 __m, __m64 __count) {
__m64_union m, res;
m.as_m64 = __m;
res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
return (res.as_m64);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld(__m64 __m, __m64 __count) {
return _mm_srl_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32(__m64 __m, int __count) {
/* Promote int to long then invoke mm_srl_pi32. */
return _mm_srl_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi(__m64 __m, int __count) {
return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */
/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32(int __i1, int __i0) {
__m64_union res;
res.as_int[0] = __i0;
res.as_int[1] = __i1;
return (res.as_m64);
}
/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
__m64_union res;
res.as_short[0] = __w0;
res.as_short[1] = __w1;
res.as_short[2] = __w2;
res.as_short[3] = __w3;
return (res.as_m64);
}
/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
char __b2, char __b1, char __b0) {
__m64_union res;
res.as_char[0] = __b0;
res.as_char[1] = __b1;
res.as_char[2] = __b2;
res.as_char[3] = __b3;
res.as_char[4] = __b4;
res.as_char[5] = __b5;
res.as_char[6] = __b6;
res.as_char[7] = __b7;
return (res.as_m64);
}
/* Similar, but with the arguments in reverse order. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32(int __i0, int __i1) {
__m64_union res;
res.as_int[0] = __i0;
res.as_int[1] = __i1;
return (res.as_m64);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
char __b5, char __b6, char __b7) {
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32(int __i) {
__m64_union res;
res.as_int[0] = __i;
res.as_int[1] = __i;
return (res.as_m64);
}
/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16(short __w) {
#if _ARCH_PWR9
__vector signed short w;
w = (__vector signed short)vec_splats(__w);
return (__m64)((__vector long long)w)[0];
#else
__m64_union res;
res.as_short[0] = __w;
res.as_short[1] = __w;
res.as_short[2] = __w;
res.as_short[3] = __w;
return (res.as_m64);
#endif
}
/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
__vector signed char b;
b = (__vector signed char)vec_splats(__b);
return (__m64)((__vector long long)b)[0];
#else
__m64_union res;
res.as_char[0] = __b;
res.as_char[1] = __b;
res.as_char[2] = __b;
res.as_char[3] = __b;
res.as_char[4] = __b;
res.as_char[5] = __b;
res.as_char[6] = __b;
res.as_char[7] = __b;
return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */
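A minimal usage sketch for the MMX compatibility wrappers above (illustrative only, not part of the header). It assumes a powerpc64le target built with -mcpu=power8, so the _ARCH_PWR8-guarded saturating adds are compiled in, and -DNO_WARN_X86_INTRINSICS to acknowledge the porting caveats:
#include <mmintrin.h>
#include <stdio.h>
#include <string.h>
int main(void) {
  __m64 a = _mm_set_pi16(4, 3, 2, 1);  /* elements 1,2,3,4; __w0 is least significant */
  __m64 b = _mm_set1_pi16(3);          /* elements 3,3,3,3 */
  __m64 sum = _mm_adds_pi16(a, b);     /* signed saturating add: 4,5,6,7 */
  __m64 gt = _mm_cmpgt_pi16(a, b);     /* 0xFFFF where a > b, else 0 */
  short s[4], g[4];
  memcpy(s, &sum, sizeof(s));
  memcpy(g, &gt, sizeof(g));
  printf("sum: %d %d %d %d  gt: %d %d %d %d\n",
         s[0], s[1], s[2], s[3], g[0], g[1], g[2], g[3]);
  return 0;
}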
+1838
View File
@@ -0,0 +1,1838 @@
/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help port code that uses Intel intrinsics
explicitly from x86_64 to powerpc64/powerpc64le.
Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
VMX/VSX ISA is a good match for vector float SIMD operations.
However, scalar float operations in vector (XMM) registers require
the POWER8 VSX ISA (2.07) level. There are differences in the data
format and placement of float scalars in the vector register, which
require extra steps to match SSE scalar float semantics on POWER.
It should be noted that there are significant differences between
X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended
to use the portable <fenv.h> interfaces instead of accessing MXCSR directly.
Most SSE scalar float intrinsic operations can be performed more
efficiently as C language float scalar operations or optimized to
use vector SIMD operations. We recommend this for new applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
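As the comment above recommends, rounding-mode and exception-state tweaks should be ported to the standard <fenv.h> interfaces rather than MXCSR accesses. A minimal sketch of that substitution (illustrative; the helper name is made up, and the macro in the comment is the x86 idiom being replaced):
#include <fenv.h>
/* Portable replacement for the x86-specific
   _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO) idiom. */
static inline int set_round_toward_zero (void)
{
  return fesetround (FE_TOWARDZERO);  /* returns 0 on success */
}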
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
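For reference, the macro packs four 2-bit lane selectors, w into bits 7:6 down to z into bits 1:0; two worked values (illustrative only):
/* (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector. */
_Static_assert (_MM_SHUFFLE (3, 2, 1, 0) == 0xE4, "identity selector");
/* (0 << 6) | (1 << 4) | (2 << 2) | 3 == 0x1B, the lane-reversing selector. */
_Static_assert (_MM_SHUFFLE (0, 1, 2, 3) == 0x1B, "reversing selector");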
#include <altivec.h>
/* Avoid collisions between altivec.h and strict adherence to C++ and
C11 standards. This should eventually be done inside altivec.h itself,
but only after testing a full distro build. */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
(defined(__STDC_VERSION__) && \
__STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif
/* We need type definitions from the MMX header file. */
#include <mmintrin.h>
/* Get _mm_malloc () and _mm_free (). */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
/* Unaligned version of the same type. */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
__aligned__ (1)));
/* Internal data types for implementing the intrinsics. */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Create an undefined vector. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
__m128 __Y = __Y;
return __Y;
}
/* Create a vector of zeros. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/* Load four SPFP values from P. The address must be 16-byte aligned. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
return ((__m128)vec_ld(0, (__v4sf*)__P));
}
/* Load four SPFP values from P. The address need not be 16-byte aligned. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
return (vec_vsx_ld(0, __P));
}
/* Load four SPFP values in reverse order. The address must be aligned. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
__v4sf __tmp;
__m128 result;
static const __vector unsigned char permute_vector =
{ 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
0x17, 0x10, 0x11, 0x12, 0x13 };
__tmp = vec_ld (0, (__v4sf *) __P);
result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
return result;
}
/* Create a vector with all four elements equal to F. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
return _mm_set1_ps (__F);
}
/* Create the vector [Z Y X W]. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}
/* Create the vector [W X Y Z]. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
/* Store four SPFP values. The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}
/* Store four SPFP values. The address need not be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
*(__m128_u *)__P = __A;
}
/* Store four SPFP values in reverse order. The address must be aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
__v4sf __tmp;
static const __vector unsigned char permute_vector =
{ 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
0x17, 0x10, 0x11, 0x12, 0x13 };
__tmp = (__m128) vec_perm (__A, __A, permute_vector);
_mm_store_ps (__P, __tmp);
}
/* Store the lower SPFP value across four words. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
__v4sf __va = vec_splat((__v4sf)__A, 0);
_mm_store_ps (__P, __va);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
_mm_store1_ps (__P, __A);
}
/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}
/* Sets the low SPFP value of A from the low value of B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}
/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
return _mm_set_ss (*__P);
}
/* Stores the lower SPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
*__P = ((__v4sf)__A)[0];
}
/* Perform the respective operation on the lower SPFP (single-precision
floating-point) values of A and B; the upper three SPFP values are
passed through from A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
__m128 a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
results. So to ensure we don't generate spurious exceptions
(from the upper float values) we splat the lower float
before we do the operation. */
a = vec_splat (__A, 0);
b = vec_splat (__B, 0);
c = a + b;
/* Then we merge the lower float result with the original upper
float elements from __A. */
return (vec_sel (__A, c, mask));
#else
__A[0] = __A[0] + __B[0];
return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
__m128 a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
results. So to ensure we don't generate spurious exceptions
(from the upper float values) we splat the lower float
before we do the operation. */
a = vec_splat (__A, 0);
b = vec_splat (__B, 0);
c = a - b;
/* Then we merge the lower float result with the original upper
float elements from __A. */
return (vec_sel (__A, c, mask));
#else
__A[0] = __A[0] - __B[0];
return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
__m128 a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
results. So to ensure we don't generate spurious exceptions
(from the upper float values) we splat the lower float
before we do the operation. */
a = vec_splat (__A, 0);
b = vec_splat (__B, 0);
c = a * b;
/* Then we merge the lower float result with the original upper
float elements from __A. */
return (vec_sel (__A, c, mask));
#else
__A[0] = __A[0] * __B[0];
return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
__m128 a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
results. So to ensure we don't generate spurious exceptions
(from the upper float values) we splat the lower float
before we do the operation. */
a = vec_splat (__A, 0);
b = vec_splat (__B, 0);
c = a / b;
/* Then we merge the lower float result with the original upper
float elements from __A. */
return (vec_sel (__A, c, mask));
#else
__A[0] = __A[0] / __B[0];
return (__A);
#endif
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
__m128 a, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
* results. So to ensure we don't generate spurious exceptions
* (from the upper float values) we splat the lower float
* before we do the operation. */
a = vec_splat (__A, 0);
c = vec_sqrt (a);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return (vec_sel (__A, c, mask));
}
/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
return (__m128) ((__v4sf)__A + (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
return (__m128) ((__v4sf)__A - (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
return (__m128) ((__v4sf)__A * (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
return (__m128) ((__v4sf)__A / (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
return (vec_sqrt ((__v4sf)__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
return (vec_re ((__v4sf)__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
return (vec_rsqrte (__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
__m128 a, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
* results. So to ensure we don't generate spurious exceptions
* (from the upper float values) we splat the lower float
* before we do the operation. */
a = vec_splat (__A, 0);
c = _mm_rcp_ps (a);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
__m128 a, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
* results. So to ensure we don't generate spurious exceptions
* (from the upper float values) we splat the lower float
* before we do the operation. */
a = vec_splat (__A, 0);
c = vec_rsqrte (a);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
__v4sf a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just lower float)
* results. So to ensure we don't generate spurious exceptions
* (from the upper float values) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf)__A, 0);
b = vec_splat ((__v4sf)__B, 0);
c = vec_min (a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return (vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
__v4sf a, b, c;
static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just lower float)
* results. So to ensure we don't generate spurious exceptions
* (from the upper float values) we splat the lower float
* before we do the operation. */
a = vec_splat (__A, 0);
b = vec_splat (__B, 0);
c = vec_max (a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return (vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
__vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
return vec_sel (__B, __A, m);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
__vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
return vec_sel (__B, __A, m);
}
/* Perform logical bit-wise operations on 128-bit values. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
// return __builtin_ia32_andps (__A, __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}
/* Perform a comparison on the four SPFP values of A and B. For each
element, if the comparison is true, place a mask of all ones in the
result, otherwise a mask of zeros. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
__v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
return ((__m128)vec_nor (temp, temp));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
__vector unsigned int a, b;
__vector unsigned int c, d;
static const __vector unsigned int float_exp_mask =
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
a = (__vector unsigned int) vec_abs ((__v4sf)__A);
b = (__vector unsigned int) vec_abs ((__v4sf)__B);
c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
return ((__m128 ) vec_and (c, d));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
__vector unsigned int a, b;
__vector unsigned int c, d;
static const __vector unsigned int float_exp_mask =
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
a = (__vector unsigned int) vec_abs ((__v4sf)__A);
b = (__vector unsigned int) vec_abs ((__v4sf)__B);
c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
return ((__m128 ) vec_or (c, d));
}
/* Perform a comparison on the lower SPFP values of A and B. If the
comparison is true, place a mask of all ones in the result, otherwise a
mask of zeros. The upper three SPFP values are passed through from A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpeq(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmplt(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmple(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpgt(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpge(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpeq(a, b);
c = vec_nor (c, c);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpge(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmpgt(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmple(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
__v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
* results. So to ensure we don't generate spurious exceptions
* (from the upper elements) we splat the lower float
* before we do the operation. */
a = vec_splat ((__v4sf) __A, 0);
b = vec_splat ((__v4sf) __B, 0);
c = (__v4sf) vec_cmplt(a, b);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
__vector unsigned int a, b;
__vector unsigned int c, d;
static const __vector unsigned int float_exp_mask =
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
a = (__vector unsigned int) vec_abs ((__v4sf)__A);
b = (__vector unsigned int) vec_abs ((__v4sf)__B);
c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
c = vec_and (c, d);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
__vector unsigned int a, b;
__vector unsigned int c, d;
static const __vector unsigned int float_exp_mask =
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
static const __vector unsigned int mask =
{ 0xffffffff, 0, 0, 0 };
a = (__vector unsigned int) vec_abs ((__v4sf)__A);
b = (__vector unsigned int) vec_abs ((__v4sf)__B);
c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
c = vec_or (c, d);
/* Then we merge the lower float result with the original upper
* float elements from __A. */
return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
/* Compare the lower SPFP values of A and B and return 1 if true
and 0 if false. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
return (__A[0] == __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
return (__A[0] < __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
return (__A[0] <= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
return (__A[0] > __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
return (__A[0] >= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
return (__A[0] != __B[0]);
}
/* FIXME
* The _mm_ucomi??_ss implementations below are exactly the same as
* the _mm_comi??_ss ones because GCC for PowerPC only generates unordered
* compares (scalar and vector).
* Technically _mm_comieq_ss et al. should be using the ordered
* compare and signal for QNaNs.
* The _mm_ucomieq_ss et al. should be OK as is.
*/
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
return (__A[0] == __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
return (__A[0] < __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
return (__A[0] <= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
return (__A[0] > __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
return (__A[0] >= __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
return (__A[0] != __B[0]);
}
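A small illustration of the FIXME above (a sketch, not part of the header): with a QNaN operand both forms return 0 on this port; under strict Intel semantics _mm_comieq_ss would additionally raise the invalid-operation exception while _mm_ucomieq_ss stays quiet.
#include <math.h>
static inline int nan_compare_demo (void)
{
  __m128 qnan = _mm_set_ss ((float) NAN);
  __m128 one  = _mm_set_ss (1.0f);
  /* Both calls evaluate to 0; only the exception behavior would differ
     from the Intel specification, as described in the FIXME. */
  return _mm_comieq_ss (qnan, one) + _mm_ucomieq_ss (qnan, one);
}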
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
return ((__v4sf)__A)[0];
}
/* Convert the lower SPFP value to a 32-bit integer according to the current
rounding mode. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
"xxsldwi %x0,%x0,%x0,3;\n"
#endif
"xscvspdp %x2,%x0;\n"
"fctiw %2,%2;\n"
"mfvsrd %1,%x2;\n"
: "+wa" (__A),
"=r" (res),
"=f" (dtmp)
: );
#else
res = __builtin_rint(__A[0]);
#endif
return (res);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
return _mm_cvtss_si32 (__A);
}
/* Convert the lower SPFP value to a 64-bit integer according to the
current rounding mode. */
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
"xxsldwi %x0,%x0,%x0,3;\n"
#endif
"xscvspdp %x2,%x0;\n"
"fctid %2,%2;\n"
"mfvsrd %1,%x2;\n"
: "+wa" (__A),
"=r" (res),
"=f" (dtmp)
: );
#else
res = __builtin_llrint(__A[0]);
#endif
return (res);
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
return _mm_cvtss_si64 ((__v4sf) __A);
}
/* Constants for use with _mm_prefetch. */
enum _mm_hint
{
/* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
_MM_HINT_ET0 = 7,
_MM_HINT_ET1 = 6,
_MM_HINT_T0 = 3,
_MM_HINT_T1 = 2,
_MM_HINT_T2 = 1,
_MM_HINT_NTA = 0
};
/* Loads one cache line from address P to a location "closer" to the
processor. The selector I specifies the type of prefetch operation. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
/* Current PowerPC implementations ignore the hint parameters. */
__builtin_prefetch (__P);
}
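Usage sketch for the prefetch wrapper above (illustrative; the helper is hypothetical): the hint constant is accepted for source compatibility, but as the comment notes it is currently ignored and everything funnels into __builtin_prefetch.
static inline float sum_with_prefetch (const float *p, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    {
      if (i + 16 < n)
        _mm_prefetch (p + i + 16, _MM_HINT_T0);  /* hint ignored on PowerPC */
      s += p[i];
    }
  return s;
}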
/* Convert the two lower SPFP values to 32-bit integers according to the
current rounding mode. Return the integers in packed form. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
__v4sf temp, rounded;
__vector unsigned long long result;
/* Splat two lower SPFP values to both halves. */
temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
rounded = vec_rint(temp);
result = (__vector unsigned long long) vec_cts (rounded, 0);
return (__m64) ((__vector long long) result)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
return _mm_cvtps_pi32 (__A);
}
/* Truncate the lower SPFP value to a 32-bit integer. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
/* Extract the lower float element. */
float temp = __A[0];
/* truncate to 32-bit integer and return. */
return temp;
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
return _mm_cvttss_si32 (__A);
}
/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
/* Extract the lower float element. */
float temp = __A[0];
/* truncate to 64-bit integer and return. */
return temp;
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
/* Extract the lower float element. */
float temp = __A[0];
/* truncate to 64-bit integer and return. */
return temp;
}
/* Truncate the two lower SPFP values to 32-bit integers. Return the
integers in packed form. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
__v4sf temp;
__vector unsigned long long result;
/* Splat two lower SPFP values to both halves. */
temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
result = (__vector unsigned long long) vec_cts (temp, 0);
return (__m64) ((__vector long long) result)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
return _mm_cvttps_pi32 (__A);
}
/* Convert B to a SPFP value and insert it as element zero in A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
float temp = __B;
__A[0] = temp;
return __A;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
return _mm_cvtsi32_ss (__A, __B);
}
/* Convert B to a SPFP value and insert it as element zero in A. */
/* Intel intrinsic. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
float temp = __B;
__A[0] = temp;
return __A;
}
/* Microsoft intrinsic. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
return _mm_cvtsi64_ss (__A, __B);
}
/* Convert the two 32-bit values in B to SPFP form and insert them
as the two lower elements in A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
__vector signed int vm1;
__vector float vf1;
vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
vf1 = (__vector float) vec_ctf (vm1, 0);
return ((__m128) (__vector unsigned long long)
{ ((__vector unsigned long long)vf1) [0],
((__vector unsigned long long)__A) [1]});
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
__vector signed short vs8;
__vector signed int vi4;
__vector float vf1;
vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
vi4 = vec_vupklsh (vs8);
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
}
/* Convert the four unsigned 16-bit values in A to SPFP form. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
const __vector unsigned short zero =
{ 0, 0, 0, 0, 0, 0, 0, 0 };
__vector unsigned short vs8;
__vector unsigned int vi4;
__vector float vf1;
vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
(vs8, zero);
#else
(zero, vs8);
#endif
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
}
/* Convert the low four signed 8-bit values in A to SPFP form. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
__vector signed char vc16;
__vector signed short vs8;
__vector signed int vi4;
__vector float vf1;
vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
vs8 = vec_vupkhsb (vc16);
vi4 = vec_vupkhsh (vs8);
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
}
/* Convert the low four unsigned 8-bit values in A to SPFP form. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
const __vector unsigned char zero =
{ 0, 0, 0, 0, 0, 0, 0, 0 };
__vector unsigned char vc16;
__vector unsigned short vs8;
__vector unsigned int vi4;
__vector float vf1;
vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
vi4 = (__vector unsigned int) vec_mergeh (vs8,
(__vector unsigned short) zero);
#else
vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
vs8);
#endif
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
}
/* Convert the four signed 32-bit values in A and B to SPFP form. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
__vector signed int vi4;
__vector float vf4;
vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
vf4 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf4;
}
/* Convert the four SPFP values in A to four signed 16-bit integers. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
__v4sf rounded;
__vector signed int temp;
__vector unsigned long long result;
rounded = vec_rint(__A);
temp = vec_cts (rounded, 0);
result = (__vector unsigned long long) vec_pack (temp, temp);
return (__m64) ((__vector long long) result)[0];
}
/* Convert the four SPFP values in A to four signed 8-bit integers. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
__v4sf rounded;
__vector signed int tmp_i;
static const __vector signed int zero = {0, 0, 0, 0};
__vector signed short tmp_s;
__vector signed char res_v;
rounded = vec_rint(__A);
tmp_i = vec_cts (rounded, 0);
tmp_s = vec_pack (tmp_i, zero);
res_v = vec_pack (tmp_s, tmp_s);
return (__m64) ((__vector long long) res_v)[0];
}
/* Selects four specific SPFP values from A and B based on MASK. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
unsigned long element_selector_10 = __mask & 0x03;
unsigned long element_selector_32 = (__mask >> 2) & 0x03;
unsigned long element_selector_54 = (__mask >> 4) & 0x03;
unsigned long element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned int permute_selectors[4] =
{
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__vector unsigned int t;
t[0] = permute_selectors[element_selector_10];
t[1] = permute_selectors[element_selector_32];
t[2] = permute_selectors[element_selector_54] + 0x10101010;
t[3] = permute_selectors[element_selector_76] + 0x10101010;
return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
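/* Illustrative usage sketch (editorial addition): the mask packs four 2-bit
   lane selectors, the low pair reading from A and the high pair from B.
   With the _MM_SHUFFLE macro defined earlier in this header,
   _MM_SHUFFLE (3, 2, 1, 0) keeps lanes 0-1 of A in the low half and lanes
   2-3 of B in the high half.  */
static inline __m128
example_shuffle_identity_halves (__m128 __A, __m128 __B)
{
  return _mm_shuffle_ps (__A, __B, _MM_SHUFFLE (3, 2, 1, 0));
}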
/* Selects and interleaves the upper two SPFP values from A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}
/* Selects and interleaves the lower two SPFP values from A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
the lower two values are passed through from A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
__vector unsigned long long __a = (__vector unsigned long long)__A;
__vector unsigned long long __p = vec_splats(*__P);
__a [1] = __p [1];
return (__m128)__a;
}
/* Stores the upper two SPFP values of A into P. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
__vector unsigned long long __a = (__vector unsigned long long) __A;
*__P = __a[1];
}
/* Moves the upper two values of B into the lower two values of A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
return (__m128) vec_mergel ((__vector unsigned long long)__B,
(__vector unsigned long long)__A);
}
/* Moves the lower two values of B into the upper two values of A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
return (__m128) vec_mergeh ((__vector unsigned long long)__A,
(__vector unsigned long long)__B);
}
/* Sets the lower two SPFP values with 64-bits of data loaded from P;
the upper two values are passed through from A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
__vector unsigned long long __a = (__vector unsigned long long)__A;
__vector unsigned long long __p = vec_splats(*__P);
__a [0] = __p [0];
return (__m128)__a;
}
/* Stores the lower two SPFP values of A into P. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
__vector unsigned long long __a = (__vector unsigned long long) __A;
*__P = __a[0];
}
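/* Illustrative usage sketch (editorial addition, hypothetical helper name):
   gathers two 64-bit halves from memory into one vector and writes them back
   out, pairing the loadl/loadh and storel/storeh intrinsics above.  */
static inline void
example_copy_halves (__m64 *__dst_lo, __m64 *__dst_hi,
		     __m64 const *__src_lo, __m64 const *__src_hi)
{
  __m128 __v = _mm_setzero_ps ();
  __v = _mm_loadl_pi (__v, __src_lo);  /* Lower two SPFP lanes.  */
  __v = _mm_loadh_pi (__v, __src_hi);  /* Upper two SPFP lanes.  */
  _mm_storel_pi (__dst_lo, __v);
  _mm_storeh_pi (__dst_hi, __v);
}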
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
#ifdef __LITTLE_ENDIAN__
0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
};
result = ((__vector unsigned long long)
vec_vbpermq ((__vector unsigned char) __A,
(__vector unsigned char) perm_mask));
#ifdef __LITTLE_ENDIAN__
return result[1];
#else
return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
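/* Illustrative usage sketch (editorial addition): _mm_movemask_ps packs the
   four sign bits into bits 0-3 of an int, so a nonzero mask means at least
   one lane has its sign bit set.  Guarded like the intrinsic above.  */
#ifdef _ARCH_PWR8
static inline int
example_any_sign_set (__m128 __v)
{
  return _mm_movemask_ps (__v) != 0;
}
#endif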
/* Create a vector with all four elements equal to *P. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
return _mm_set1_ps (*__P);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
return _mm_load1_ps (__P);
}
/* Extracts one of the four words of A. The selector N must be immediate. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
unsigned int shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
shiftr = 3 - shiftr;
#endif
return ((__A >> (shiftr * 16)) & 0xffff);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
return _mm_extract_pi16 (__A, __N);
}
/* Inserts word D into one of four words of A. The selector N must be
immediate. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
const int shiftl = (__N & 3) * 16;
const __m64 shiftD = (const __m64) __D << shiftl;
const __m64 mask = 0xffffUL << shiftl;
__m64 result = (__A & (~mask)) | (shiftD & mask);
return (result);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
return _mm_insert_pi16 (__A, __D, __N);
}
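/* Illustrative usage sketch (editorial addition, hypothetical helper name):
   the selector picks one of the four 16-bit words; here word 2 is read and
   then replaced.  */
static inline __m64
example_replace_word2 (__m64 __v, short __d)
{
  int __old = _mm_extract_pi16 (__v, 2);  /* Read word 2.  */
  (void) __old;                           /* Unused in this sketch.  */
  return _mm_insert_pi16 (__v, __d, 2);   /* Write word 2.  */
}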
/* Compute the element-wise maximum of signed 16-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
__vector signed short a, b, r;
__vector __bool short c;
a = (__vector signed short)vec_splats (__A);
b = (__vector signed short)vec_splats (__B);
c = (__vector __bool short)vec_cmpgt (a, b);
r = vec_sel (b, a, c);
return (__m64) ((__vector long long) r)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __A;
m2.as_m64 = __B;
res.as_short[0] =
(m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
res.as_short[1] =
(m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
res.as_short[2] =
(m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
res.as_short[3] =
(m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
return (__m64) res.as_m64;
#endif
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
return _mm_max_pi16 (__A, __B);
}
/* Compute the element-wise maximum of unsigned 8-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
__vector unsigned char a, b, r;
__vector __bool char c;
a = (__vector unsigned char)vec_splats (__A);
b = (__vector unsigned char)vec_splats (__B);
c = (__vector __bool char)vec_cmpgt (a, b);
r = vec_sel (b, a, c);
return (__m64) ((__vector long long) r)[0];
#else
__m64_union m1, m2, res;
long i;
m1.as_m64 = __A;
m2.as_m64 = __B;
for (i = 0; i < 8; i++)
res.as_char[i] =
((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
m1.as_char[i] : m2.as_char[i];
return (__m64) res.as_m64;
#endif
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
return _mm_max_pu8 (__A, __B);
}
/* Compute the element-wise minimum of signed 16-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
__vector signed short a, b, r;
__vector __bool short c;
a = (__vector signed short)vec_splats (__A);
b = (__vector signed short)vec_splats (__B);
c = (__vector __bool short)vec_cmplt (a, b);
r = vec_sel (b, a, c);
return (__m64) ((__vector long long) r)[0];
#else
__m64_union m1, m2, res;
m1.as_m64 = __A;
m2.as_m64 = __B;
res.as_short[0] =
(m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
res.as_short[1] =
(m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
res.as_short[2] =
(m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
res.as_short[3] =
(m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
return (__m64) res.as_m64;
#endif
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
return _mm_min_pi16 (__A, __B);
}
/* Compute the element-wise minimum of unsigned 8-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
__vector unsigned char a, b, r;
__vector __bool char c;
a = (__vector unsigned char)vec_splats (__A);
b = (__vector unsigned char)vec_splats (__B);
c = (__vector __bool char)vec_cmplt (a, b);
r = vec_sel (b, a, c);
return (__m64) ((__vector long long) r)[0];
#else
__m64_union m1, m2, res;
long i;
m1.as_m64 = __A;
m2.as_m64 = __B;
for (i = 0; i < 8; i++)
res.as_char[i] =
((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
m1.as_char[i] : m2.as_char[i];
return (__m64) res.as_m64;
#endif
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
return _mm_min_pu8 (__A, __B);
}
/* Create an 8-bit mask of the signs of 8-bit values. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
#else
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
return _mm_movemask_pi8 (__A);
}
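/* Illustrative usage sketch (editorial addition): bit i of the mask reflects
   the sign bit of byte i, so counting the set bits counts the bytes whose
   high bit is set.  __builtin_popcount is a GCC/Clang builtin.  */
static inline int
example_count_sign_bytes (__m64 __v)
{
  return __builtin_popcount (_mm_movemask_pi8 (__v));
}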
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
in B and produce the high 16 bits of the 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
__vector unsigned short a, b;
__vector unsigned short c;
__vector unsigned int w0, w1;
__vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
};
a = (__vector unsigned short)vec_splats (__A);
b = (__vector unsigned short)vec_splats (__B);
w0 = vec_vmuleuh (a, b);
w1 = vec_vmulouh (a, b);
c = (__vector unsigned short)vec_perm (w0, w1, xform1);
return (__m64) ((__vector long long) c)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A. The selector
must be an immediate. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
unsigned long element_selector_10 = __N & 0x03;
unsigned long element_selector_32 = (__N >> 2) & 0x03;
unsigned long element_selector_54 = (__N >> 4) & 0x03;
unsigned long element_selector_76 = (__N >> 6) & 0x03;
static const unsigned short permute_selectors[4] =
{
#ifdef __LITTLE_ENDIAN__
0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
0x0607, 0x0405, 0x0203, 0x0001
#endif
};
__m64_union t;
__vector unsigned long long a, p, r;
#ifdef __LITTLE_ENDIAN__
t.as_short[0] = permute_selectors[element_selector_10];
t.as_short[1] = permute_selectors[element_selector_32];
t.as_short[2] = permute_selectors[element_selector_54];
t.as_short[3] = permute_selectors[element_selector_76];
#else
t.as_short[3] = permute_selectors[element_selector_10];
t.as_short[2] = permute_selectors[element_selector_32];
t.as_short[1] = permute_selectors[element_selector_54];
t.as_short[0] = permute_selectors[element_selector_76];
#endif
p = vec_splats (t.as_m64);
a = vec_splats (__A);
r = vec_perm (a, a, (__vector unsigned char)p);
return (__m64) ((__vector long long) r)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
return _mm_shuffle_pi16 (__A, __N);
}
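/* Illustrative usage sketch (editorial addition): like _mm_shuffle_ps, the
   selector packs four 2-bit word indexes; 0x1B reverses the four 16-bit
   words of the argument.  */
static inline __m64
example_reverse_words (__m64 __v)
{
  return _mm_shuffle_pi16 (__v, 0x1B);
}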
/* Conditionally store byte elements of A into P. The high bit of each
byte in the selector N determines whether the corresponding byte from
A is stored. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
__m64 hibit = 0x8080808080808080UL;
__m64 mask, tmp;
__m64 *p = (__m64*)__P;
tmp = *p;
mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
tmp = (tmp & (~mask)) | (__A & mask);
*p = tmp;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
_mm_maskmove_si64 (__A, __N, __P);
}
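/* Illustrative usage sketch (editorial addition, hypothetical helper name):
   writes only the bytes of __data that differ from __ref, leaving the other
   bytes at *__P untouched; the mask is built with _mm_cmpeq_pi8 (used above)
   and inverted so changed bytes have their high bit set.  */
static inline void
example_store_changed_bytes (__m64 __data, __m64 __ref, char *__P)
{
  __m64 __equal = _mm_cmpeq_pi8 (__data, __ref);
  _mm_maskmove_si64 (__data, ~__equal, __P);
}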
/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
__vector unsigned char a, b, c;
a = (__vector unsigned char)vec_splats (__A);
b = (__vector unsigned char)vec_splats (__B);
c = vec_avg (a, b);
return (__m64) ((__vector long long) c)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
return _mm_avg_pu8 (__A, __B);
}
/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
__vector unsigned short a, b, c;
a = (__vector unsigned short)vec_splats (__A);
b = (__vector unsigned short)vec_splats (__B);
c = vec_avg (a, b);
return (__m64) ((__vector long long) c)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
return _mm_avg_pu16 (__A, __B);
}
/* Compute the sum of the absolute differences of the unsigned 8-bit
values in A and B. Return the value in the lower 16-bit word; the
upper words are cleared. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
__vector unsigned char a, b;
__vector unsigned char vmin, vmax, vabsdiff;
__vector signed int vsum;
const __vector unsigned int zero =
{ 0, 0, 0, 0 };
__m64_union result = {0};
a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
vmin = vec_min (a, b);
vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
/* Sum across four integers with integer result. */
vsum = vec_sums (vsum, (__vector signed int) zero);
/* The sum is in the rightmost 32 bits of the vector result.
Transfer to a GPR and truncate to 16 bits. */
result.as_short[0] = vsum[3];
return result.as_m64;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
return _mm_sad_pu8 (__A, __B);
}
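/* Illustrative usage sketch (editorial addition): the sum of absolute
   differences lands in the low 16-bit word, so a scalar distance can be
   read back with a mask.  */
static inline unsigned int
example_byte_distance (__m64 __a, __m64 __b)
{
  return (unsigned int) (_mm_sad_pu8 (__a, __b) & 0xffff);
}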
/* Stores the data in A to the address P without polluting the caches. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
/* Use the data cache block touch for store transient. */
__asm__ (
" dcbtstt 0,%0"
:
: "b" (__P)
: "memory"
);
*__P = __A;
}
/* Likewise. The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
/* Use the data cache block touch for store transient. */
__asm__ (
" dcbtstt 0,%0"
:
: "b" (__P)
: "memory"
);
_mm_store_ps (__P, __A);
}
/* Guarantees that every preceding store is globally visible before
any subsequent store. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
/* Generate a light weight sync. */
__atomic_thread_fence (__ATOMIC_RELEASE);
}
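/* Illustrative usage sketch (editorial addition, hypothetical helper name):
   publish data and then a flag; the release fence keeps the data store
   visible before the flag store.  __data is assumed 16-byte aligned, as
   _mm_store_ps requires.  */
static inline void
example_publish (float *__data, __m128 __v, volatile int *__flag)
{
  _mm_store_ps (__data, __v);
  _mm_sfence ();
  *__flag = 1;
}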
/* The execution of the next instruction is delayed by an implementation
specific amount of time. The instruction does not modify the
architectural state. This is after the pop_options pragma because
it does not require SSE support in the processor--the encoding is a
nop on processors that do not support it. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
/* There is no exact match with this construct, but the following is
close to the desired effect. */
#if _ARCH_PWR8
/* On power8 and later processors we can depend on Program Priority
(PRI) and the associated "very low" PRI setting.  Since we don't know
what PRI this thread is running at, we: 1) save the current PRI
from the PPR SPR into a local GPR, 2) set the PRI to "very low"
via the special or 31,31,31 encoding, 3) issue an "isync" to
ensure the PRI change takes effect before we execute any more
instructions.
Now we can execute a lwsync (release barrier) while we execute
this thread at "very low" PRI.  Finally we restore the original
PRI and continue execution.  */
unsigned long __PPR;
__asm__ volatile (
" mfppr %0;"
" or 31,31,31;"
" isync;"
" lwsync;"
" isync;"
" mtppr %0;"
: "=r" (__PPR)
:
: "memory"
);
#else
/* For older processor where we may not even have Program Priority
controls we can only depend on Heavy Weight Sync. */
__atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
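/* Illustrative usage sketch (editorial addition): the usual pattern is to
   call _mm_pause inside a spin loop so the waiting hardware thread lowers
   its priority while it polls.  */
static inline void
example_spin_wait (volatile int *__flag)
{
  while (*__flag == 0)
    _mm_pause ();
}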
/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
__v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
__v4sf __t0 = vec_vmrghw (__r0, __r1); \
__v4sf __t1 = vec_vmrghw (__r2, __r3); \
__v4sf __t2 = vec_vmrglw (__r0, __r1); \
__v4sf __t3 = vec_vmrglw (__r2, __r3); \
(row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
(__vector long long)__t1); \
(row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
(__vector long long)__t1); \
(row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
(__vector long long)__t3); \
(row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
(__vector long long)__t3); \
} while (0)
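/* Illustrative usage sketch (editorial addition, hypothetical helper name):
   transposes a 4x4 float matrix stored in row-major order, using the
   unaligned load/store intrinsics defined earlier in this header.  */
static inline void
example_transpose_4x4 (float *__m)
{
  __m128 __row0 = _mm_loadu_ps (__m + 0);
  __m128 __row1 = _mm_loadu_ps (__m + 4);
  __m128 __row2 = _mm_loadu_ps (__m + 8);
  __m128 __row3 = _mm_loadu_ps (__m + 12);
  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
  _mm_storeu_ps (__m + 0, __row0);
  _mm_storeu_ps (__m + 4, __row1);
  _mm_storeu_ps (__m + 8, __row2);
  _mm_storeu_ps (__m + 12, __row3);
}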
/* For backward source compatibility. */
//# include <emmintrin.h>
#endif /* _XMMINTRIN_H_INCLUDED */
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
+3 -17
View File
@@ -1,22 +1,8 @@
/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/

Some files were not shown because too many files have changed in this diff