mirror of
https://codeberg.org/ziglang/zig.git
synced 2026-04-27 19:09:47 +03:00
Update clang headers
llvm commit b2851aea80e5a8f0cfd6c3c5a56a6b00fb28c6b6
This commit is contained in:
+9
@@ -55,7 +55,9 @@ struct __cuda_builtin_threadIdx_t {
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
|
||||
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
|
||||
// uint3). These operators are defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
__attribute__((device)) operator uint3() const;
|
||||
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
|
||||
};
|
||||
@@ -66,7 +68,9 @@ struct __cuda_builtin_blockIdx_t {
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
|
||||
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
|
||||
// uint3). These operators are defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
__attribute__((device)) operator uint3() const;
|
||||
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
|
||||
};
|
||||
@@ -78,6 +82,8 @@ struct __cuda_builtin_blockDim_t {
|
||||
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
|
||||
// dim3). These operators are defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
__attribute__((device)) operator uint3() const;
|
||||
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
|
||||
};
|
||||
@@ -89,6 +95,8 @@ struct __cuda_builtin_gridDim_t {
|
||||
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
|
||||
// dim3). These operators are defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
__attribute__((device)) operator uint3() const;
|
||||
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
|
||||
};
|
||||
@@ -108,5 +116,6 @@ __attribute__((device)) const int warpSize = 32;
|
||||
#undef __CUDA_DEVICE_BUILTIN
|
||||
#undef __CUDA_BUILTIN_VAR
|
||||
#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
|
||||
#undef __DELETE
|
||||
|
||||
#endif /* __CUDA_BUILTIN_VARS_H */
|
||||
|
||||
Vendored
+46
-4
@@ -66,10 +66,38 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
|
||||
}
|
||||
|
||||
// For inscrutable reasons, the CUDA headers define these functions for us on
|
||||
// Windows. For OpenMP we omit these as some old system headers have
|
||||
// non-conforming `isinf(float)` and `isnan(float)` implementations that return
|
||||
// an `int`. The system versions of these functions should be fine anyway.
|
||||
#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__)
|
||||
// Windows.
|
||||
#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
|
||||
|
||||
// For OpenMP we work around some old system headers that have non-conforming
|
||||
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
|
||||
// this by providing two versions of these functions, differing only in the
|
||||
// return type. To avoid conflicting definitions we disable implicit base
|
||||
// function generation. That means we will end up with two specializations, one
|
||||
// per type, but only one has a base function defined by the system header.
|
||||
#if defined(__OPENMP_NVPTX__)
|
||||
#pragma omp begin declare variant match( \
|
||||
implementation = {extension(disable_implicit_base)})
|
||||
|
||||
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
|
||||
// add a suffix. This means we would clash with the names of the variants
|
||||
// (note that we do not create implicit base functions here). To avoid
|
||||
// this clash we add a new trait to some of them that is always true
|
||||
// (this is LLVM after all ;)). It will only influence the mangled name
|
||||
// of the variants inside the inner region and avoid the clash.
|
||||
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
|
||||
|
||||
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
|
||||
__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
|
||||
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
#endif
|
||||
|
||||
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
|
||||
@@ -79,6 +107,11 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
|
||||
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
|
||||
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
|
||||
|
||||
#if defined(__OPENMP_NVPTX__)
|
||||
#pragma omp end declare variant
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
__DEVICE__ bool isgreater(float __x, float __y) {
|
||||
@@ -142,6 +175,15 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
|
||||
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
|
||||
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
|
||||
|
||||
// There was a redefinition error for this overload in CUDA mode.
|
||||
// We restrict it to OpenMP mode for now, that is where it is actually needed
|
||||
// anyway.
|
||||
#ifdef __OPENMP_NVPTX__
|
||||
__DEVICE__ float remquo(float __n, float __d, int *__q) {
|
||||
return ::remquof(__n, __d, __q);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Notably missing above is nexttoward. We omit it because
|
||||
// libdevice doesn't provide an implementation, and we don't want to be in the
|
||||
// business of implementing tricky libm functions in this header.
|
||||
|
||||
+28
-2
@@ -41,6 +41,27 @@
|
||||
#define _ABSf std::abs
|
||||
#define _LOGBd std::logb
|
||||
#define _LOGBf std::logb
|
||||
// Rather than pulling in std::max from algorithm every time, use available ::max.
|
||||
#define _fmaxd max
|
||||
#define _fmaxf max
|
||||
#else
|
||||
#ifdef __AMDGCN__
|
||||
#define _ISNANd __ocml_isnan_f64
|
||||
#define _ISNANf __ocml_isnan_f32
|
||||
#define _ISINFd __ocml_isinf_f64
|
||||
#define _ISINFf __ocml_isinf_f32
|
||||
#define _ISFINITEd __ocml_isfinite_f64
|
||||
#define _ISFINITEf __ocml_isfinite_f32
|
||||
#define _COPYSIGNd __ocml_copysign_f64
|
||||
#define _COPYSIGNf __ocml_copysign_f32
|
||||
#define _SCALBNd __ocml_scalbn_f64
|
||||
#define _SCALBNf __ocml_scalbn_f32
|
||||
#define _ABSd __ocml_fabs_f64
|
||||
#define _ABSf __ocml_fabs_f32
|
||||
#define _LOGBd __ocml_logb_f64
|
||||
#define _LOGBf __ocml_logb_f32
|
||||
#define _fmaxd __ocml_fmax_f64
|
||||
#define _fmaxf __ocml_fmax_f32
|
||||
#else
|
||||
#define _ISNANd __nv_isnand
|
||||
#define _ISNANf __nv_isnanf
|
||||
@@ -56,6 +77,9 @@
|
||||
#define _ABSf __nv_fabsf
|
||||
#define _LOGBd __nv_logb
|
||||
#define _LOGBf __nv_logbf
|
||||
#define _fmaxd __nv_fmax
|
||||
#define _fmaxf __nv_fmaxf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
@@ -167,7 +191,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
|
||||
// Can't use std::max, because that's defined in <algorithm>, and we don't
|
||||
// want to pull that in for every compile. The CUDA headers define
|
||||
// ::max(float, float) and ::max(double, double), which is sufficient for us.
|
||||
double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d)));
|
||||
double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
|
||||
if (_ISFINITEd(__logbw)) {
|
||||
__ilogbw = (int)__logbw;
|
||||
__c = _SCALBNd(__c, -__ilogbw);
|
||||
@@ -200,7 +224,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
|
||||
|
||||
__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
|
||||
int __ilogbw = 0;
|
||||
float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d)));
|
||||
float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
|
||||
if (_ISFINITEf(__logbw)) {
|
||||
__ilogbw = (int)__logbw;
|
||||
__c = _SCALBNf(__c, -__ilogbw);
|
||||
@@ -249,6 +273,8 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
|
||||
#undef _ABSf
|
||||
#undef _LOGBd
|
||||
#undef _LOGBf
|
||||
#undef _fmaxd
|
||||
#undef _fmaxf
|
||||
|
||||
#ifdef __OPENMP_NVPTX__
|
||||
#pragma omp end declare target
|
||||
|
||||
Vendored
+5
-4
@@ -195,8 +195,8 @@ __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
|
||||
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
|
||||
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
|
||||
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
|
||||
// nearbyint/nearbyintf forward to the compiler builtins. The libdevice-based
// (__nv_nearbyint*) wrappers are dropped so each function is defined once.
__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); }
__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); }
|
||||
__DEVICE__ double nextafter(double __a, double __b) {
|
||||
return __nv_nextafter(__a, __b);
|
||||
}
|
||||
@@ -249,8 +249,9 @@ __DEVICE__ double rhypot(double __a, double __b) {
|
||||
__DEVICE__ float rhypotf(float __a, float __b) {
|
||||
return __nv_rhypotf(__a, __b);
|
||||
}
|
||||
// __nv_rint* in libdevice is buggy and produces incorrect results, so
// rint/rintf forward to the compiler builtins instead. Only the builtin-based
// definitions are kept so each function is defined exactly once.
__DEVICE__ double rint(double __a) { return __builtin_rint(__a); }
__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); }
|
||||
__DEVICE__ double rnorm(int __a, const double *__b) {
|
||||
return __nv_rnorm(__a, __b);
|
||||
}
|
||||
|
||||
+18
-10
@@ -377,30 +377,38 @@ __device__ static inline void *malloc(size_t __size) {
|
||||
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
|
||||
// come after we've pulled in the definition of uint3 and dim3.
|
||||
|
||||
// Converts the built-in threadIdx variable to a dim3 constructed from its
// x, y and z components.
__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
// Converts the built-in threadIdx variable to a uint3 holding its x, y and z
// components. A single braced return is used, matching the blockDim/gridDim
// uint3 operators; the member-wise copy that preceded it made the braced
// return unreachable.
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
  return {x, y, z};
}
|
||||
|
||||
// Converts the built-in blockIdx variable to a dim3 constructed from its
// x, y and z components.
__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
// Converts the built-in blockIdx variable to a uint3 holding its x, y and z
// components. A single braced return is used, matching the blockDim/gridDim
// uint3 operators; the member-wise copy that preceded it made the braced
// return unreachable.
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
  return {x, y, z};
}
|
||||
|
||||
// Converts the built-in blockDim variable to a dim3 constructed from its
// x, y and z components.
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
// Converts the built-in blockDim variable to a uint3 brace-initialized from
// its x, y and z components.
__device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
|
||||
return {x, y, z};
|
||||
}
|
||||
|
||||
// Converts the built-in gridDim variable to a dim3 constructed from its
// x, y and z components.
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
// Converts the built-in gridDim variable to a uint3 brace-initialized from
// its x, y and z components.
__device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
|
||||
return {x, y, z};
|
||||
}
|
||||
|
||||
#include <__clang_cuda_cmath.h>
|
||||
#include <__clang_cuda_intrinsics.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
|
||||
Vendored
+629
@@ -0,0 +1,629 @@
|
||||
/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_HIP_CMATH_H__
|
||||
#define __CLANG_HIP_CMATH_H__
|
||||
|
||||
#if !defined(__HIP__)
|
||||
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#endif
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
|
||||
|
||||
// Start with functions that cannot be defined by DEF macros below.
|
||||
#if defined(__cplusplus)
|
||||
// abs() overloads for double, float, long long and long. Each forwards to the
// correspondingly typed C function in the global namespace.
__DEVICE__ double abs(double __x) {
  return ::fabs(__x);
}
__DEVICE__ float abs(float __x) {
  return ::fabsf(__x);
}
__DEVICE__ long long abs(long long __n) {
  return ::llabs(__n);
}
__DEVICE__ long abs(long __n) {
  return ::labs(__n);
}
|
||||
// Single-precision fma: forwards to ::fmaf.
__DEVICE__ float fma(float __x, float __y, float __z) {
  return ::fmaf(__x, __y, __z);
}

// fpclassify for float and double: both hand the FP_* category constants to
// __builtin_fpclassify, which selects the one matching __x.
__DEVICE__ int fpclassify(float __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
}
__DEVICE__ int fpclassify(double __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
}

// Single-precision frexp: forwards to ::frexpf, passing __exp through.
__DEVICE__ float frexp(float __arg, int *__exp) {
  return ::frexpf(__arg, __exp);
}
|
||||
// Classification predicates. isfinite/isinf/isnan forward to the
// double-underscore helpers in the global namespace; isnormal uses the
// compiler builtin.
__DEVICE__ bool isfinite(float __x) {
  return ::__finitef(__x);
}
__DEVICE__ bool isfinite(double __x) {
  return ::__finite(__x);
}

// Comparison predicates, float and double flavours, implemented with the
// matching __builtin_is* comparison.
__DEVICE__ bool isgreater(float __x, float __y) {
  return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreater(double __x, double __y) {
  return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreaterequal(float __x, float __y) {
  return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isgreaterequal(double __x, double __y) {
  return __builtin_isgreaterequal(__x, __y);
}

__DEVICE__ bool isinf(float __x) {
  return ::__isinff(__x);
}
__DEVICE__ bool isinf(double __x) {
  return ::__isinf(__x);
}

__DEVICE__ bool isless(float __x, float __y) {
  return __builtin_isless(__x, __y);
}
__DEVICE__ bool isless(double __x, double __y) {
  return __builtin_isless(__x, __y);
}
__DEVICE__ bool islessequal(float __x, float __y) {
  return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessequal(double __x, double __y) {
  return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessgreater(float __x, float __y) {
  return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool islessgreater(double __x, double __y) {
  return __builtin_islessgreater(__x, __y);
}

__DEVICE__ bool isnan(float __x) {
  return ::__isnanf(__x);
}
__DEVICE__ bool isnan(double __x) {
  return ::__isnan(__x);
}

__DEVICE__ bool isnormal(float __x) {
  return __builtin_isnormal(__x);
}
__DEVICE__ bool isnormal(double __x) {
  return __builtin_isnormal(__x);
}

__DEVICE__ bool isunordered(float __x, float __y) {
  return __builtin_isunordered(__x, __y);
}
__DEVICE__ bool isunordered(double __x, double __y) {
  return __builtin_isunordered(__x, __y);
}
|
||||
// Single-precision modf: forwards to ::modff.
__DEVICE__ float modf(float __x, float *__iptr) {
  return ::modff(__x, __iptr);
}

// pow with an integer exponent, forwarded to ::powif / ::powi.
__DEVICE__ float pow(float __base, int __iexp) {
  return ::powif(__base, __iexp);
}
__DEVICE__ double pow(double __base, int __iexp) {
  return ::powi(__base, __iexp);
}

// Single-precision remquo and scalbln, forwarded to their *f counterparts.
__DEVICE__ float remquo(float __x, float __y, int *__quo) {
  return ::remquof(__x, __y, __quo);
}
__DEVICE__ float scalbln(float __x, long int __n) {
  return ::scalblnf(__x, __n);
}

// signbit for float and double, forwarded to ::__signbitf / ::__signbit.
__DEVICE__ bool signbit(float __x) {
  return ::__signbitf(__x);
}
__DEVICE__ bool signbit(double __x) {
  return ::__signbit(__x);
}
|
||||
|
||||
// Notably missing above is nexttoward. We omit it because
|
||||
// ocml doesn't provide an implementation, and we don't want to be in the
|
||||
// business of implementing tricky libm functions in this header.
|
||||
|
||||
// Other functions.
|
||||
// Half-precision fma: forwards to __ocml_fma_f16.
__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
  _Float16 __fused = __ocml_fma_f16(__x, __y, __z);
  return __fused;
}

// Half-precision pow with an integer exponent: forwards to __ocml_pown_f16.
__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {
  _Float16 __raised = __ocml_pown_f16(__base, __iexp);
  return __raised;
}
|
||||
|
||||
// BEGIN DEF_FUN and HIP_OVERLOAD
|
||||
|
||||
// BEGIN DEF_FUN
|
||||
|
||||
#pragma push_macro("__DEF_FUN1")
|
||||
#pragma push_macro("__DEF_FUN2")
|
||||
#pragma push_macro("__DEF_FUN2_FI")
|
||||
|
||||
// Define cmath functions with float argument and returns __retty.
|
||||
#define __DEF_FUN1(__retty, __func) \
|
||||
__DEVICE__ \
|
||||
__retty __func(float __x) { return __func##f(__x); }
|
||||
|
||||
// Define cmath functions with two float arguments and returns __retty.
|
||||
#define __DEF_FUN2(__retty, __func) \
|
||||
__DEVICE__ \
|
||||
__retty __func(float __x, float __y) { return __func##f(__x, __y); }
|
||||
|
||||
// Define cmath functions with a float and an int argument and returns __retty.
|
||||
#define __DEF_FUN2_FI(__retty, __func) \
|
||||
__DEVICE__ \
|
||||
__retty __func(float __x, int __y) { return __func##f(__x, __y); }
|
||||
|
||||
__DEF_FUN1(float, acos)
|
||||
__DEF_FUN1(float, acosh)
|
||||
__DEF_FUN1(float, asin)
|
||||
__DEF_FUN1(float, asinh)
|
||||
__DEF_FUN1(float, atan)
|
||||
__DEF_FUN2(float, atan2)
|
||||
__DEF_FUN1(float, atanh)
|
||||
__DEF_FUN1(float, cbrt)
|
||||
__DEF_FUN1(float, ceil)
|
||||
__DEF_FUN2(float, copysign)
|
||||
__DEF_FUN1(float, cos)
|
||||
__DEF_FUN1(float, cosh)
|
||||
__DEF_FUN1(float, erf)
|
||||
__DEF_FUN1(float, erfc)
|
||||
__DEF_FUN1(float, exp)
|
||||
__DEF_FUN1(float, exp2)
|
||||
__DEF_FUN1(float, expm1)
|
||||
__DEF_FUN1(float, fabs)
|
||||
__DEF_FUN2(float, fdim)
|
||||
__DEF_FUN1(float, floor)
|
||||
__DEF_FUN2(float, fmax)
|
||||
__DEF_FUN2(float, fmin)
|
||||
__DEF_FUN2(float, fmod)
|
||||
__DEF_FUN2(float, hypot)
|
||||
__DEF_FUN1(int, ilogb)
|
||||
__DEF_FUN2_FI(float, ldexp)
|
||||
__DEF_FUN1(float, lgamma)
|
||||
__DEF_FUN1(float, log)
|
||||
__DEF_FUN1(float, log10)
|
||||
__DEF_FUN1(float, log1p)
|
||||
__DEF_FUN1(float, log2)
|
||||
__DEF_FUN1(float, logb)
|
||||
__DEF_FUN1(long long, llrint)
|
||||
__DEF_FUN1(long long, llround)
|
||||
__DEF_FUN1(long, lrint)
|
||||
__DEF_FUN1(long, lround)
|
||||
__DEF_FUN1(float, nearbyint)
|
||||
__DEF_FUN2(float, nextafter)
|
||||
__DEF_FUN2(float, pow)
|
||||
__DEF_FUN2(float, remainder)
|
||||
__DEF_FUN1(float, rint)
|
||||
__DEF_FUN1(float, round)
|
||||
__DEF_FUN2_FI(float, scalbn)
|
||||
__DEF_FUN1(float, sin)
|
||||
__DEF_FUN1(float, sinh)
|
||||
__DEF_FUN1(float, sqrt)
|
||||
__DEF_FUN1(float, tan)
|
||||
__DEF_FUN1(float, tanh)
|
||||
__DEF_FUN1(float, tgamma)
|
||||
__DEF_FUN1(float, trunc)
|
||||
|
||||
#pragma pop_macro("__DEF_FUN1")
|
||||
#pragma pop_macro("__DEF_FUN2")
|
||||
#pragma pop_macro("__DEF_FUN2_FI")
|
||||
|
||||
// END DEF_FUN
|
||||
|
||||
// BEGIN HIP_OVERLOAD
|
||||
|
||||
#pragma push_macro("__HIP_OVERLOAD1")
|
||||
#pragma push_macro("__HIP_OVERLOAD2")
|
||||
|
||||
// __hip_enable_if is a type function: the nested `type` names __T when __B is
// true and does not exist otherwise.
template <bool __B, class __T = void>
struct __hip_enable_if {
};

template <class __T>
struct __hip_enable_if<true, __T> {
  typedef __T type;
};
|
||||
|
||||
// decltype is only available in C++11 and above.
|
||||
#if __cplusplus >= 201103L
|
||||
// __hip_promote
|
||||
namespace __hip {
|
||||
|
||||
template <class _Tp> struct __numeric_type {
|
||||
static void __test(...);
|
||||
static _Float16 __test(_Float16);
|
||||
static float __test(float);
|
||||
static double __test(char);
|
||||
static double __test(int);
|
||||
static double __test(unsigned);
|
||||
static double __test(long);
|
||||
static double __test(unsigned long);
|
||||
static double __test(long long);
|
||||
static double __test(unsigned long long);
|
||||
static double __test(double);
|
||||
// No support for long double, use double instead.
|
||||
static double __test(long double);
|
||||
|
||||
typedef decltype(__test(std::declval<_Tp>())) type;
|
||||
static const bool value = !std::is_same<type, void>::value;
|
||||
};
|
||||
|
||||
template <> struct __numeric_type<void> { static const bool value = true; };
|
||||
|
||||
template <class _A1, class _A2 = void, class _A3 = void,
|
||||
bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
|
||||
&&__numeric_type<_A3>::value>
|
||||
class __promote_imp {
|
||||
public:
|
||||
static const bool value = false;
|
||||
};
|
||||
|
||||
template <class _A1, class _A2, class _A3>
|
||||
class __promote_imp<_A1, _A2, _A3, true> {
|
||||
private:
|
||||
typedef typename __promote_imp<_A1>::type __type1;
|
||||
typedef typename __promote_imp<_A2>::type __type2;
|
||||
typedef typename __promote_imp<_A3>::type __type3;
|
||||
|
||||
public:
|
||||
typedef decltype(__type1() + __type2() + __type3()) type;
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
|
||||
private:
|
||||
typedef typename __promote_imp<_A1>::type __type1;
|
||||
typedef typename __promote_imp<_A2>::type __type2;
|
||||
|
||||
public:
|
||||
typedef decltype(__type1() + __type2()) type;
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template <class _A1> class __promote_imp<_A1, void, void, true> {
|
||||
public:
|
||||
typedef typename __numeric_type<_A1>::type type;
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template <class _A1, class _A2 = void, class _A3 = void>
|
||||
class __promote : public __promote_imp<_A1, _A2, _A3> {};
|
||||
|
||||
} // namespace __hip
|
||||
#endif //__cplusplus >= 201103L
|
||||
|
||||
// __HIP_OVERLOAD1 generates a template overload of __fn that accepts an
// integer argument, so that such calls do not fail to compile due to
// ambiguity. e.g. floor(5) is resolved with floor(double).
#define __HIP_OVERLOAD1(__retty, __fn)                                         \
  template <typename __T>                                                      \
  __DEVICE__                                                                   \
      typename __hip_enable_if<std::numeric_limits<__T>::is_integer,           \
                               __retty>::type                                  \
      __fn(__T __x) {                                                          \
    return ::__fn((double)__x);                                                \
  }
|
||||
|
||||
// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
// or integer argument to avoid compilation error due to ambiguity. e.g.
// max(5.0f, 6.0) is resolved with max(double, double).
//
// Both variants return __retty. Returning the promoted argument type instead
// would make predicates instantiated with a bool __retty (isgreater, isless,
// isunordered, ...) yield a floating-point value for mixed-type calls.
// The C++11 variant still casts the arguments to their promoted common type
// before forwarding; __promote<...>::value is part of the enable condition so
// that argument types without a promoted type drop out of overload resolution
// rather than failing inside the body. The pre-C++11 variant has no promotion
// trait available and casts both arguments to double.
#if __cplusplus >= 201103L
#define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
  __DEVICE__ typename __hip_enable_if<                                         \
      std::numeric_limits<__T1>::is_specialized &&                             \
          std::numeric_limits<__T2>::is_specialized &&                         \
          __hip::__promote<__T1, __T2>::value,                                 \
      __retty>::type                                                           \
  __fn(__T1 __x, __T2 __y) {                                                   \
    typedef typename __hip::__promote<__T1, __T2>::type __arg_type;            \
    return __fn((__arg_type)__x, (__arg_type)__y);                             \
  }
#else
#define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
  __DEVICE__                                                                   \
      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
                                   std::numeric_limits<__T2>::is_specialized,  \
                               __retty>::type                                  \
      __fn(__T1 __x, __T2 __y) {                                               \
    return __fn((double)__x, (double)__y);                                     \
  }
#endif
|
||||
|
||||
__HIP_OVERLOAD1(double, abs)
|
||||
__HIP_OVERLOAD1(double, acos)
|
||||
__HIP_OVERLOAD1(double, acosh)
|
||||
__HIP_OVERLOAD1(double, asin)
|
||||
__HIP_OVERLOAD1(double, asinh)
|
||||
__HIP_OVERLOAD1(double, atan)
|
||||
__HIP_OVERLOAD2(double, atan2)
|
||||
__HIP_OVERLOAD1(double, atanh)
|
||||
__HIP_OVERLOAD1(double, cbrt)
|
||||
__HIP_OVERLOAD1(double, ceil)
|
||||
__HIP_OVERLOAD2(double, copysign)
|
||||
__HIP_OVERLOAD1(double, cos)
|
||||
__HIP_OVERLOAD1(double, cosh)
|
||||
__HIP_OVERLOAD1(double, erf)
|
||||
__HIP_OVERLOAD1(double, erfc)
|
||||
__HIP_OVERLOAD1(double, exp)
|
||||
__HIP_OVERLOAD1(double, exp2)
|
||||
__HIP_OVERLOAD1(double, expm1)
|
||||
__HIP_OVERLOAD1(double, fabs)
|
||||
__HIP_OVERLOAD2(double, fdim)
|
||||
__HIP_OVERLOAD1(double, floor)
|
||||
__HIP_OVERLOAD2(double, fmax)
|
||||
__HIP_OVERLOAD2(double, fmin)
|
||||
__HIP_OVERLOAD2(double, fmod)
|
||||
__HIP_OVERLOAD1(int, fpclassify)
|
||||
__HIP_OVERLOAD2(double, hypot)
|
||||
__HIP_OVERLOAD1(int, ilogb)
|
||||
__HIP_OVERLOAD1(bool, isfinite)
|
||||
__HIP_OVERLOAD2(bool, isgreater)
|
||||
__HIP_OVERLOAD2(bool, isgreaterequal)
|
||||
__HIP_OVERLOAD1(bool, isinf)
|
||||
__HIP_OVERLOAD2(bool, isless)
|
||||
__HIP_OVERLOAD2(bool, islessequal)
|
||||
__HIP_OVERLOAD2(bool, islessgreater)
|
||||
__HIP_OVERLOAD1(bool, isnan)
|
||||
__HIP_OVERLOAD1(bool, isnormal)
|
||||
__HIP_OVERLOAD2(bool, isunordered)
|
||||
__HIP_OVERLOAD1(double, lgamma)
|
||||
__HIP_OVERLOAD1(double, log)
|
||||
__HIP_OVERLOAD1(double, log10)
|
||||
__HIP_OVERLOAD1(double, log1p)
|
||||
__HIP_OVERLOAD1(double, log2)
|
||||
__HIP_OVERLOAD1(double, logb)
|
||||
__HIP_OVERLOAD1(long long, llrint)
|
||||
__HIP_OVERLOAD1(long long, llround)
|
||||
__HIP_OVERLOAD1(long, lrint)
|
||||
__HIP_OVERLOAD1(long, lround)
|
||||
__HIP_OVERLOAD1(double, nearbyint)
|
||||
__HIP_OVERLOAD2(double, nextafter)
|
||||
__HIP_OVERLOAD2(double, pow)
|
||||
__HIP_OVERLOAD2(double, remainder)
|
||||
__HIP_OVERLOAD1(double, rint)
|
||||
__HIP_OVERLOAD1(double, round)
|
||||
__HIP_OVERLOAD1(bool, signbit)
|
||||
__HIP_OVERLOAD1(double, sin)
|
||||
__HIP_OVERLOAD1(double, sinh)
|
||||
__HIP_OVERLOAD1(double, sqrt)
|
||||
__HIP_OVERLOAD1(double, tan)
|
||||
__HIP_OVERLOAD1(double, tanh)
|
||||
__HIP_OVERLOAD1(double, tgamma)
|
||||
__HIP_OVERLOAD1(double, trunc)
|
||||
|
||||
// Overload these but don't add them to std, they are not part of cmath.
|
||||
__HIP_OVERLOAD2(double, max)
|
||||
__HIP_OVERLOAD2(double, min)
|
||||
|
||||
// Additional Overloads that don't quite match HIP_OVERLOAD.
|
||||
#if __cplusplus >= 201103L
|
||||
template <typename __T1, typename __T2, typename __T3>
|
||||
__DEVICE__ typename __hip_enable_if<
|
||||
std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized &&
|
||||
std::numeric_limits<__T3>::is_specialized,
|
||||
typename __hip::__promote<__T1, __T2, __T3>::type>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
|
||||
return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
|
||||
}
|
||||
#else
|
||||
template <typename __T1, typename __T2, typename __T3>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized &&
|
||||
std::numeric_limits<__T3>::is_specialized,
|
||||
double>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
return ::fma((double)__x, (double)__y, (double)__z);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
frexp(__T __x, int *__exp) {
|
||||
return ::frexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
ldexp(__T __x, int __exp) {
|
||||
return ::ldexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
modf(__T __x, double *__exp) {
|
||||
return ::modf((double)__x, __exp);
|
||||
}
|
||||
|
||||
#if __cplusplus >= 201103L
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized,
|
||||
typename __hip::__promote<__T1, __T2>::type>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
|
||||
return ::remquo((__result_type)__x, (__result_type)__y, __quo);
|
||||
}
|
||||
#else
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized,
|
||||
double>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
return ::remquo((double)__x, (double)__y, __quo);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
scalbln(__T __x, long int __exp) {
|
||||
return ::scalbln((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
scalbn(__T __x, int __exp) {
|
||||
return ::scalbn((double)__x, __exp);
|
||||
}
|
||||
|
||||
#pragma pop_macro("__HIP_OVERLOAD1")
|
||||
#pragma pop_macro("__HIP_OVERLOAD2")
|
||||
|
||||
// END HIP_OVERLOAD
|
||||
|
||||
// END DEF_FUN and HIP_OVERLOAD
|
||||
|
||||
#endif // defined(__cplusplus)
|
||||
|
||||
// Define these overloads inside the namespace our standard library uses.
|
||||
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
#else
|
||||
namespace std {
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Pull the new overloads we defined above into namespace std.
|
||||
// using ::abs; - This may be considered for C++.
|
||||
using ::acos;
|
||||
using ::acosh;
|
||||
using ::asin;
|
||||
using ::asinh;
|
||||
using ::atan;
|
||||
using ::atan2;
|
||||
using ::atanh;
|
||||
using ::cbrt;
|
||||
using ::ceil;
|
||||
using ::copysign;
|
||||
using ::cos;
|
||||
using ::cosh;
|
||||
using ::erf;
|
||||
using ::erfc;
|
||||
using ::exp;
|
||||
using ::exp2;
|
||||
using ::expm1;
|
||||
using ::fabs;
|
||||
using ::fdim;
|
||||
using ::floor;
|
||||
using ::fma;
|
||||
using ::fmax;
|
||||
using ::fmin;
|
||||
using ::fmod;
|
||||
using ::fpclassify;
|
||||
using ::frexp;
|
||||
using ::hypot;
|
||||
using ::ilogb;
|
||||
using ::isfinite;
|
||||
using ::isgreater;
|
||||
using ::isgreaterequal;
|
||||
using ::isless;
|
||||
using ::islessequal;
|
||||
using ::islessgreater;
|
||||
using ::isnormal;
|
||||
using ::isunordered;
|
||||
using ::ldexp;
|
||||
using ::lgamma;
|
||||
using ::llrint;
|
||||
using ::llround;
|
||||
using ::log;
|
||||
using ::log10;
|
||||
using ::log1p;
|
||||
using ::log2;
|
||||
using ::logb;
|
||||
using ::lrint;
|
||||
using ::lround;
|
||||
using ::modf;
|
||||
// using ::nan; - This may be considered for C++.
|
||||
// using ::nanf; - This may be considered for C++.
|
||||
// using ::nanl; - This is not yet defined.
|
||||
using ::nearbyint;
|
||||
using ::nextafter;
|
||||
// using ::nexttoward; - Omit this since we do not have a definition.
|
||||
using ::pow;
|
||||
using ::remainder;
|
||||
using ::remquo;
|
||||
using ::rint;
|
||||
using ::round;
|
||||
using ::scalbln;
|
||||
using ::scalbn;
|
||||
using ::signbit;
|
||||
using ::sin;
|
||||
using ::sinh;
|
||||
using ::sqrt;
|
||||
using ::tan;
|
||||
using ::tanh;
|
||||
using ::tgamma;
|
||||
using ::trunc;
|
||||
|
||||
// Well this is fun: We need to pull these symbols in for libc++, but we can't
|
||||
// pull them in with libstdc++, because its ::isinf and ::isnan are different
|
||||
// than its std::isinf and std::isnan.
|
||||
#ifndef __GLIBCXX__
|
||||
using ::isinf;
|
||||
using ::isnan;
|
||||
#endif
|
||||
|
||||
// Finally, pull the "foobarf" functions that HIP defines into std.
|
||||
using ::acosf;
|
||||
using ::acoshf;
|
||||
using ::asinf;
|
||||
using ::asinhf;
|
||||
using ::atan2f;
|
||||
using ::atanf;
|
||||
using ::atanhf;
|
||||
using ::cbrtf;
|
||||
using ::ceilf;
|
||||
using ::copysignf;
|
||||
using ::cosf;
|
||||
using ::coshf;
|
||||
using ::erfcf;
|
||||
using ::erff;
|
||||
using ::exp2f;
|
||||
using ::expf;
|
||||
using ::expm1f;
|
||||
using ::fabsf;
|
||||
using ::fdimf;
|
||||
using ::floorf;
|
||||
using ::fmaf;
|
||||
using ::fmaxf;
|
||||
using ::fminf;
|
||||
using ::fmodf;
|
||||
using ::frexpf;
|
||||
using ::hypotf;
|
||||
using ::ilogbf;
|
||||
using ::ldexpf;
|
||||
using ::lgammaf;
|
||||
using ::llrintf;
|
||||
using ::llroundf;
|
||||
using ::log10f;
|
||||
using ::log1pf;
|
||||
using ::log2f;
|
||||
using ::logbf;
|
||||
using ::logf;
|
||||
using ::lrintf;
|
||||
using ::lroundf;
|
||||
using ::modff;
|
||||
using ::nearbyintf;
|
||||
using ::nextafterf;
|
||||
// using ::nexttowardf; - Omit this since we do not have a definition.
|
||||
using ::powf;
|
||||
using ::remainderf;
|
||||
using ::remquof;
|
||||
using ::rintf;
|
||||
using ::roundf;
|
||||
using ::scalblnf;
|
||||
using ::scalbnf;
|
||||
using ::sinf;
|
||||
using ::sinhf;
|
||||
using ::sqrtf;
|
||||
using ::tanf;
|
||||
using ::tanhf;
|
||||
using ::tgammaf;
|
||||
using ::truncf;
|
||||
|
||||
#ifdef _LIBCPP_END_NAMESPACE_STD
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
#else
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_END_NAMESPACE_VERSION
|
||||
#endif
|
||||
} // namespace std
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
#endif // __CLANG_HIP_CMATH_H__
|
||||
+17
-9
@@ -10,7 +10,9 @@
|
||||
#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// BEGIN FLOAT
|
||||
__device__ __attribute__((const)) float __ocml_acos_f32(float);
|
||||
@@ -78,6 +80,7 @@ __device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
|
||||
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
|
||||
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
|
||||
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
|
||||
__device__ float __ocml_remquo_f32(float, float,
|
||||
@@ -126,10 +129,10 @@ __device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
|
||||
@@ -205,6 +208,7 @@ __device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
|
||||
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
|
||||
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
|
||||
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
|
||||
__device__ double __ocml_remquo_f64(double, double,
|
||||
@@ -252,10 +256,10 @@ __device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
|
||||
@@ -290,6 +294,7 @@ __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
||||
__device__ _Float16 __ocml_sin_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
|
||||
|
||||
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
||||
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
||||
@@ -313,14 +318,17 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
||||
__device__ inline __2f16
|
||||
__llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL.
|
||||
{
|
||||
return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)};
|
||||
return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y));
|
||||
}
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_sin_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
|
||||
Vendored
+642
-552
@@ -1,4 +1,4 @@
|
||||
/*===---- __clang_hip_math.h - HIP math decls -------------------------------===
|
||||
/*===---- __clang_hip_math.h - Device-side HIP math support ----------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
@@ -6,24 +6,57 @@
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_HIP_MATH_H__
|
||||
#define __CLANG_HIP_MATH_H__
|
||||
|
||||
#if !defined(__HIP__)
|
||||
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <algorithm>
|
||||
#endif
|
||||
#include <limits.h>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#pragma push_macro("__RETURN_TYPE")
|
||||
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
|
||||
|
||||
// to be consistent with __clang_cuda_math_forward_declares
|
||||
#define __DEVICE__ static __device__
|
||||
// A few functions return bool type starting only in C++11.
|
||||
#pragma push_macro("__RETURN_TYPE")
|
||||
#if defined(__cplusplus)
|
||||
#define __RETURN_TYPE bool
|
||||
#else
|
||||
#define __RETURN_TYPE int
|
||||
#endif
|
||||
|
||||
#if defined (__cplusplus) && __cplusplus < 201103L
|
||||
// emulate static_assert on type sizes
|
||||
template<bool>
|
||||
struct __compare_result{};
|
||||
template<>
|
||||
struct __compare_result<true> {
|
||||
static const bool valid;
|
||||
};
|
||||
|
||||
__DEVICE__
|
||||
inline uint64_t __make_mantissa_base8(const char *__tagp) {
|
||||
void __suppress_unused_warning(bool b){};
|
||||
template <unsigned int S, unsigned int T>
|
||||
__DEVICE__ void __static_assert_equal_size() {
|
||||
__suppress_unused_warning(__compare_result<S == T>::valid);
|
||||
}
|
||||
|
||||
#define __static_assert_type_size_equal(A, B) \
|
||||
__static_assert_equal_size<A,B>()
|
||||
|
||||
#else
|
||||
#define __static_assert_type_size_equal(A,B) \
|
||||
static_assert((A) == (B), "")
|
||||
|
||||
#endif
|
||||
|
||||
__DEVICE__
|
||||
uint64_t __make_mantissa_base8(const char *__tagp) {
|
||||
uint64_t __r = 0;
|
||||
while (__tagp) {
|
||||
char __tmp = *__tagp;
|
||||
@@ -40,7 +73,7 @@ inline uint64_t __make_mantissa_base8(const char *__tagp) {
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline uint64_t __make_mantissa_base10(const char *__tagp) {
|
||||
uint64_t __make_mantissa_base10(const char *__tagp) {
|
||||
uint64_t __r = 0;
|
||||
while (__tagp) {
|
||||
char __tmp = *__tagp;
|
||||
@@ -57,7 +90,7 @@ inline uint64_t __make_mantissa_base10(const char *__tagp) {
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline uint64_t __make_mantissa_base16(const char *__tagp) {
|
||||
uint64_t __make_mantissa_base16(const char *__tagp) {
|
||||
uint64_t __r = 0;
|
||||
while (__tagp) {
|
||||
char __tmp = *__tagp;
|
||||
@@ -78,7 +111,7 @@ inline uint64_t __make_mantissa_base16(const char *__tagp) {
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline uint64_t __make_mantissa(const char *__tagp) {
|
||||
uint64_t __make_mantissa(const char *__tagp) {
|
||||
if (!__tagp)
|
||||
return 0u;
|
||||
|
||||
@@ -95,78 +128,124 @@ inline uint64_t __make_mantissa(const char *__tagp) {
|
||||
}
|
||||
|
||||
// BEGIN FLOAT
|
||||
#if defined(__cplusplus)
|
||||
__DEVICE__
|
||||
inline float abs(float __x) { return __ocml_fabs_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float acosf(float __x) { return __ocml_acos_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float acoshf(float __x) { return __ocml_acosh_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float asinf(float __x) { return __ocml_asin_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float asinhf(float __x) { return __ocml_asinh_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float atanf(float __x) { return __ocml_atan_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float atanhf(float __x) { return __ocml_atanh_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float ceilf(float __x) { return __ocml_ceil_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float copysignf(float __x, float __y) {
|
||||
return __ocml_copysign_f32(__x, __y);
|
||||
int abs(int __x) {
|
||||
int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
|
||||
return (__x ^ __sgn) - __sgn;
|
||||
}
|
||||
__DEVICE__
|
||||
inline float cosf(float __x) { return __ocml_cos_f32(__x); }
|
||||
long labs(long __x) {
|
||||
long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
|
||||
return (__x ^ __sgn) - __sgn;
|
||||
}
|
||||
__DEVICE__
|
||||
inline float coshf(float __x) { return __ocml_cosh_f32(__x); }
|
||||
long long llabs(long long __x) {
|
||||
long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
|
||||
return (__x ^ __sgn) - __sgn;
|
||||
}
|
||||
#endif
|
||||
|
||||
__DEVICE__
|
||||
inline float cospif(float __x) { return __ocml_cospi_f32(__x); }
|
||||
float acosf(float __x) { return __ocml_acos_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
|
||||
float acoshf(float __x) { return __ocml_acosh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
|
||||
float asinf(float __x) { return __ocml_asin_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float erfcf(float __x) { return __ocml_erfc_f32(__x); }
|
||||
float asinhf(float __x) { return __ocml_asinh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
|
||||
float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
|
||||
float atanf(float __x) { return __ocml_atan_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float erff(float __x) { return __ocml_erf_f32(__x); }
|
||||
float atanhf(float __x) { return __ocml_atanh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
|
||||
float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float exp10f(float __x) { return __ocml_exp10_f32(__x); }
|
||||
float ceilf(float __x) { return __ocml_ceil_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float exp2f(float __x) { return __ocml_exp2_f32(__x); }
|
||||
float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float expf(float __x) { return __ocml_exp_f32(__x); }
|
||||
float cosf(float __x) { return __ocml_cos_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float expm1f(float __x) { return __ocml_expm1_f32(__x); }
|
||||
float coshf(float __x) { return __ocml_cosh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fabsf(float __x) { return __ocml_fabs_f32(__x); }
|
||||
float cospif(float __x) { return __ocml_cospi_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
|
||||
float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fdividef(float __x, float __y) { return __x / __y; }
|
||||
float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float floorf(float __x) { return __ocml_floor_f32(__x); }
|
||||
float erfcf(float __x) { return __ocml_erfc_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fmaf(float __x, float __y, float __z) {
|
||||
float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float erff(float __x) { return __ocml_erf_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float exp10f(float __x) { return __ocml_exp10_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float exp2f(float __x) { return __ocml_exp2_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float expf(float __x) { return __ocml_exp_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float expm1f(float __x) { return __ocml_expm1_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float fabsf(float __x) { return __ocml_fabs_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
float fdividef(float __x, float __y) { return __x / __y; }
|
||||
|
||||
__DEVICE__
|
||||
float floorf(float __x) { return __ocml_floor_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float fmaf(float __x, float __y, float __z) {
|
||||
return __ocml_fma_f32(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
|
||||
float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
|
||||
float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
|
||||
float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float frexpf(float __x, int *__nptr) {
|
||||
float frexpf(float __x, int *__nptr) {
|
||||
int __tmp;
|
||||
float __r =
|
||||
__ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
|
||||
@@ -174,24 +253,31 @@ inline float frexpf(float __x, int *__nptr) {
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
|
||||
float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
|
||||
int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isfinite(float __x) { return __ocml_isfinite_f32(__x); }
|
||||
__RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isinf(float __x) { return __ocml_isinf_f32(__x); }
|
||||
__RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isnan(float __x) { return __ocml_isnan_f32(__x); }
|
||||
__RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float j0f(float __x) { return __ocml_j0_f32(__x); }
|
||||
float j0f(float __x) { return __ocml_j0_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float j1f(float __x) { return __ocml_j1_f32(__x); }
|
||||
float j1f(float __x) { return __ocml_j1_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float jnf(int __n,
|
||||
float __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
// for linear recurrences to get O(log n) steps, but it's unclear if
|
||||
// it'd be beneficial in this case.
|
||||
if (__n == 0)
|
||||
@@ -209,50 +295,61 @@ inline float jnf(int __n,
|
||||
|
||||
return __x1;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
|
||||
float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
|
||||
|
||||
__DEVICE__
|
||||
inline float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
|
||||
float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long long int llroundf(float __x) { return __ocml_round_f32(__x); }
|
||||
long long int llroundf(float __x) { return __ocml_round_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float log10f(float __x) { return __ocml_log10_f32(__x); }
|
||||
float log10f(float __x) { return __ocml_log10_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float log1pf(float __x) { return __ocml_log1p_f32(__x); }
|
||||
float log1pf(float __x) { return __ocml_log1p_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float log2f(float __x) { return __ocml_log2_f32(__x); }
|
||||
float log2f(float __x) { return __ocml_log2_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float logbf(float __x) { return __ocml_logb_f32(__x); }
|
||||
float logbf(float __x) { return __ocml_logb_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float logf(float __x) { return __ocml_log_f32(__x); }
|
||||
float logf(float __x) { return __ocml_log_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long int lrintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
long int lrintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long int lroundf(float __x) { return __ocml_round_f32(__x); }
|
||||
long int lroundf(float __x) { return __ocml_round_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float modff(float __x, float *__iptr) {
|
||||
float modff(float __x, float *__iptr) {
|
||||
float __tmp;
|
||||
float __r =
|
||||
__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
|
||||
*__iptr = __tmp;
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float nanf(const char *__tagp) {
|
||||
float nanf(const char *__tagp) {
|
||||
union {
|
||||
float val;
|
||||
struct ieee_float {
|
||||
uint32_t mantissa : 22;
|
||||
uint32_t quiet : 1;
|
||||
uint32_t exponent : 8;
|
||||
uint32_t sign : 1;
|
||||
unsigned int mantissa : 22;
|
||||
unsigned int quiet : 1;
|
||||
unsigned int exponent : 8;
|
||||
unsigned int sign : 1;
|
||||
} bits;
|
||||
|
||||
static_assert(sizeof(float) == sizeof(ieee_float), "");
|
||||
} __tmp;
|
||||
__static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));
|
||||
|
||||
__tmp.bits.sign = 0u;
|
||||
__tmp.bits.exponent = ~0u;
|
||||
@@ -261,28 +358,34 @@ inline float nanf(const char *__tagp) {
|
||||
|
||||
return __tmp.val;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
|
||||
float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float nextafterf(float __x, float __y) {
|
||||
float nextafterf(float __x, float __y) {
|
||||
return __ocml_nextafter_f32(__x, __y);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float norm3df(float __x, float __y, float __z) {
|
||||
float norm3df(float __x, float __y, float __z) {
|
||||
return __ocml_len3_f32(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float norm4df(float __x, float __y, float __z, float __w) {
|
||||
float norm4df(float __x, float __y, float __z, float __w) {
|
||||
return __ocml_len4_f32(__x, __y, __z, __w);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
|
||||
float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
|
||||
float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float
|
||||
normf(int __dim,
|
||||
const float *__a) { // TODO: placeholder until OCML adds support.
|
||||
float normf(int __dim,
|
||||
const float *__a) { // TODO: placeholder until OCML adds support.
|
||||
float __r = 0;
|
||||
while (__dim--) {
|
||||
__r += __a[0] * __a[0];
|
||||
@@ -291,16 +394,23 @@ normf(int __dim,
|
||||
|
||||
return __ocml_sqrt_f32(__r);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
||||
float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
|
||||
float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float remainderf(float __x, float __y) {
|
||||
float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
float remainderf(float __x, float __y) {
|
||||
return __ocml_remainder_f32(__x, __y);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float remquof(float __x, float __y, int *__quo) {
|
||||
float remquof(float __x, float __y, int *__quo) {
|
||||
int __tmp;
|
||||
float __r = __ocml_remquo_f32(
|
||||
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
|
||||
@@ -308,25 +418,26 @@ inline float remquof(float __x, float __y, int *__quo) {
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float rhypotf(float __x, float __y) {
|
||||
return __ocml_rhypot_f32(__x, __y);
|
||||
}
|
||||
float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float rintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
float rintf(float __x) { return __ocml_rint_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float rnorm3df(float __x, float __y, float __z) {
|
||||
float rnorm3df(float __x, float __y, float __z) {
|
||||
return __ocml_rlen3_f32(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float rnorm4df(float __x, float __y, float __z, float __w) {
|
||||
float rnorm4df(float __x, float __y, float __z, float __w) {
|
||||
return __ocml_rlen4_f32(__x, __y, __z, __w);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float
|
||||
rnormf(int __dim,
|
||||
const float *__a) { // TODO: placeholder until OCML adds support.
|
||||
float rnormf(int __dim,
|
||||
const float *__a) { // TODO: placeholder until OCML adds support.
|
||||
float __r = 0;
|
||||
while (__dim--) {
|
||||
__r += __a[0] * __a[0];
|
||||
@@ -335,59 +446,74 @@ rnormf(int __dim,
|
||||
|
||||
return __ocml_rsqrt_f32(__r);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float roundf(float __x) { return __ocml_round_f32(__x); }
|
||||
float roundf(float __x) { return __ocml_round_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
|
||||
float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float scalblnf(float __x, long int __n) {
|
||||
float scalblnf(float __x, long int __n) {
|
||||
return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n)
|
||||
: __ocml_scalb_f32(__x, __n);
|
||||
}
|
||||
__DEVICE__
|
||||
inline float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE signbit(float __x) { return __ocml_signbit_f32(__x); }
|
||||
__DEVICE__
|
||||
inline void sincosf(float __x, float *__sinptr, float *__cosptr) {
|
||||
float __tmp;
|
||||
|
||||
__DEVICE__
|
||||
float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
|
||||
|
||||
__DEVICE__
|
||||
__RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
void sincosf(float __x, float *__sinptr, float *__cosptr) {
|
||||
float __tmp;
|
||||
*__sinptr =
|
||||
__ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
|
||||
*__cosptr = __tmp;
|
||||
}
|
||||
__DEVICE__
|
||||
inline void sincospif(float __x, float *__sinptr, float *__cosptr) {
|
||||
float __tmp;
|
||||
|
||||
__DEVICE__
|
||||
void sincospif(float __x, float *__sinptr, float *__cosptr) {
|
||||
float __tmp;
|
||||
*__sinptr = __ocml_sincospi_f32(
|
||||
__x, (__attribute__((address_space(5))) float *)&__tmp);
|
||||
*__cosptr = __tmp;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float sinf(float __x) { return __ocml_sin_f32(__x); }
|
||||
float sinf(float __x) { return __ocml_sin_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float sinhf(float __x) { return __ocml_sinh_f32(__x); }
|
||||
float sinhf(float __x) { return __ocml_sinh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
|
||||
float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
|
||||
float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float tanf(float __x) { return __ocml_tan_f32(__x); }
|
||||
float tanf(float __x) { return __ocml_tan_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float tanhf(float __x) { return __ocml_tanh_f32(__x); }
|
||||
float tanhf(float __x) { return __ocml_tanh_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
|
||||
float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float truncf(float __x) { return __ocml_trunc_f32(__x); }
|
||||
float truncf(float __x) { return __ocml_trunc_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float y0f(float __x) { return __ocml_y0_f32(__x); }
|
||||
float y0f(float __x) { return __ocml_y0_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float y1f(float __x) { return __ocml_y1_f32(__x); }
|
||||
float y1f(float __x) { return __ocml_y1_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float ynf(int __n,
|
||||
float __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
// for linear recurrences to get O(log n) steps, but it's unclear if
|
||||
// it'd be beneficial in this case. Placeholder until OCML adds
|
||||
// support.
|
||||
@@ -408,290 +534,343 @@ inline float ynf(int __n,
|
||||
}
|
||||
|
||||
// BEGIN INTRINSICS
|
||||
|
||||
__DEVICE__
|
||||
inline float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
|
||||
float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
|
||||
float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __expf(float __x) { return __ocml_native_exp_f32(__x); }
|
||||
float __expf(float __x) { return __ocml_native_exp_f32(__x); }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fadd_rd(float __x, float __y) {
|
||||
return __ocml_add_rtn_f32(__x, __y);
|
||||
}
|
||||
float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __fadd_rn(float __x, float __y) { return __x + __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline float __fadd_rn(float __x, float __y) { return __x + __y; }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fadd_ru(float __x, float __y) {
|
||||
return __ocml_add_rtp_f32(__x, __y);
|
||||
}
|
||||
float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __fadd_rz(float __x, float __y) {
|
||||
return __ocml_add_rtz_f32(__x, __y);
|
||||
}
|
||||
float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __fdiv_rd(float __x, float __y) {
|
||||
return __ocml_div_rtn_f32(__x, __y);
|
||||
}
|
||||
float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __fdiv_rn(float __x, float __y) { return __x / __y; }
|
||||
#endif
|
||||
|
||||
__DEVICE__
|
||||
inline float __fdiv_rn(float __x, float __y) { return __x / __y; }
|
||||
float __fdividef(float __x, float __y) { return __x / __y; }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fdiv_ru(float __x, float __y) {
|
||||
return __ocml_div_rtp_f32(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline float __fdiv_rz(float __x, float __y) {
|
||||
return __ocml_div_rtz_f32(__x, __y);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline float __fdividef(float __x, float __y) { return __x / __y; }
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fmaf_rd(float __x, float __y, float __z) {
|
||||
float __fmaf_rd(float __x, float __y, float __z) {
|
||||
return __ocml_fma_rtn_f32(__x, __y, __z);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline float __fmaf_rn(float __x, float __y, float __z) {
|
||||
return __ocml_fma_f32(__x, __y, __z);
|
||||
float __fmaf_rn(float __x, float __y, float __z) {
|
||||
return __ocml_fma_rte_f32(__x, __y, __z);
|
||||
}
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fmaf_ru(float __x, float __y, float __z) {
|
||||
float __fmaf_ru(float __x, float __y, float __z) {
|
||||
return __ocml_fma_rtp_f32(__x, __y, __z);
|
||||
}
|
||||
__DEVICE__
|
||||
inline float __fmaf_rz(float __x, float __y, float __z) {
|
||||
float __fmaf_rz(float __x, float __y, float __z) {
|
||||
return __ocml_fma_rtz_f32(__x, __y, __z);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
inline float __fmul_rd(float __x, float __y) {
|
||||
return __ocml_mul_rtn_f32(__x, __y);
|
||||
float __fmaf_rn(float __x, float __y, float __z) {
|
||||
return __ocml_fma_f32(__x, __y, __z);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline float __fmul_rn(float __x, float __y) { return __x * __y; }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fmul_ru(float __x, float __y) {
|
||||
return __ocml_mul_rtp_f32(__x, __y);
|
||||
}
|
||||
float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __fmul_rz(float __x, float __y) {
|
||||
return __ocml_mul_rtz_f32(__x, __y);
|
||||
}
|
||||
float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __frcp_rd(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
|
||||
float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __fmul_rn(float __x, float __y) { return __x * __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline float __frcp_rn(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __frcp_ru(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
|
||||
float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
|
||||
__DEVICE__
|
||||
inline float __frcp_rz(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
|
||||
float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
|
||||
__DEVICE__
|
||||
float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
|
||||
__DEVICE__
|
||||
float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __frcp_rn(float __x) { return 1.0f / __x; }
|
||||
#endif
|
||||
|
||||
__DEVICE__
|
||||
inline float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
|
||||
float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
|
||||
#endif
|
||||
float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
|
||||
float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
|
||||
__DEVICE__
|
||||
float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
|
||||
__DEVICE__
|
||||
float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
|
||||
#endif
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
|
||||
float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
|
||||
float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __fsub_rd(float __x, float __y) {
|
||||
return __ocml_sub_rtn_f32(__x, __y);
|
||||
}
|
||||
float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
|
||||
#else
|
||||
__DEVICE__
|
||||
float __fsub_rn(float __x, float __y) { return __x - __y; }
|
||||
#endif
|
||||
|
||||
__DEVICE__
|
||||
inline float __fsub_rn(float __x, float __y) { return __x - __y; }
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __fsub_ru(float __x, float __y) {
|
||||
return __ocml_sub_rtp_f32(__x, __y);
|
||||
}
|
||||
float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __fsub_rz(float __x, float __y) {
|
||||
return __ocml_sub_rtz_f32(__x, __y);
|
||||
}
|
||||
#endif
|
||||
float __logf(float __x) { return __ocml_native_log_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
|
||||
float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
|
||||
float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __logf(float __x) { return __ocml_native_log_f32(__x); }
|
||||
__DEVICE__
|
||||
inline float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
||||
__DEVICE__
|
||||
inline float __saturatef(float __x) {
|
||||
return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x);
|
||||
}
|
||||
__DEVICE__
|
||||
inline void __sincosf(float __x, float *__sinptr, float *__cosptr) {
|
||||
void __sincosf(float __x, float *__sinptr, float *__cosptr) {
|
||||
*__sinptr = __ocml_native_sin_f32(__x);
|
||||
*__cosptr = __ocml_native_cos_f32(__x);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
|
||||
float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline float __tanf(float __x) { return __ocml_tan_f32(__x); }
|
||||
float __tanf(float __x) { return __ocml_tan_f32(__x); }
|
||||
// END INTRINSICS
|
||||
// END FLOAT
|
||||
|
||||
// BEGIN DOUBLE
|
||||
__DEVICE__
|
||||
inline double abs(double __x) { return __ocml_fabs_f64(__x); }
|
||||
double acos(double __x) { return __ocml_acos_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double acos(double __x) { return __ocml_acos_f64(__x); }
|
||||
double acosh(double __x) { return __ocml_acosh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double acosh(double __x) { return __ocml_acosh_f64(__x); }
|
||||
double asin(double __x) { return __ocml_asin_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double asin(double __x) { return __ocml_asin_f64(__x); }
|
||||
double asinh(double __x) { return __ocml_asinh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double asinh(double __x) { return __ocml_asinh_f64(__x); }
|
||||
double atan(double __x) { return __ocml_atan_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double atan(double __x) { return __ocml_atan_f64(__x); }
|
||||
double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double atan2(double __x, double __y) {
|
||||
return __ocml_atan2_f64(__x, __y);
|
||||
}
|
||||
double atanh(double __x) { return __ocml_atanh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double atanh(double __x) { return __ocml_atanh_f64(__x); }
|
||||
double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
|
||||
double ceil(double __x) { return __ocml_ceil_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double ceil(double __x) { return __ocml_ceil_f64(__x); }
|
||||
__DEVICE__
|
||||
inline double copysign(double __x, double __y) {
|
||||
double copysign(double __x, double __y) {
|
||||
return __ocml_copysign_f64(__x, __y);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double cos(double __x) { return __ocml_cos_f64(__x); }
|
||||
double cos(double __x) { return __ocml_cos_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double cosh(double __x) { return __ocml_cosh_f64(__x); }
|
||||
double cosh(double __x) { return __ocml_cosh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double cospi(double __x) { return __ocml_cospi_f64(__x); }
|
||||
double cospi(double __x) { return __ocml_cospi_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
|
||||
double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
|
||||
double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double erf(double __x) { return __ocml_erf_f64(__x); }
|
||||
double erf(double __x) { return __ocml_erf_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double erfc(double __x) { return __ocml_erfc_f64(__x); }
|
||||
double erfc(double __x) { return __ocml_erfc_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
|
||||
double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
|
||||
double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
|
||||
double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double exp(double __x) { return __ocml_exp_f64(__x); }
|
||||
double exp(double __x) { return __ocml_exp_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double exp10(double __x) { return __ocml_exp10_f64(__x); }
|
||||
double exp10(double __x) { return __ocml_exp10_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double exp2(double __x) { return __ocml_exp2_f64(__x); }
|
||||
double exp2(double __x) { return __ocml_exp2_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double expm1(double __x) { return __ocml_expm1_f64(__x); }
|
||||
double expm1(double __x) { return __ocml_expm1_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double fabs(double __x) { return __ocml_fabs_f64(__x); }
|
||||
double fabs(double __x) { return __ocml_fabs_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
|
||||
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double floor(double __x) { return __ocml_floor_f64(__x); }
|
||||
double floor(double __x) { return __ocml_floor_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double fma(double __x, double __y, double __z) {
|
||||
double fma(double __x, double __y, double __z) {
|
||||
return __ocml_fma_f64(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
|
||||
double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
|
||||
double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
|
||||
double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double frexp(double __x, int *__nptr) {
|
||||
double frexp(double __x, int *__nptr) {
|
||||
int __tmp;
|
||||
double __r =
|
||||
__ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
|
||||
*__nptr = __tmp;
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double hypot(double __x, double __y) {
|
||||
return __ocml_hypot_f64(__x, __y);
|
||||
}
|
||||
double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
|
||||
int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isfinite(double __x) { return __ocml_isfinite_f64(__x); }
|
||||
__RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isinf(double __x) { return __ocml_isinf_f64(__x); }
|
||||
__RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE isnan(double __x) { return __ocml_isnan_f64(__x); }
|
||||
__RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double j0(double __x) { return __ocml_j0_f64(__x); }
|
||||
double j0(double __x) { return __ocml_j0_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double j1(double __x) { return __ocml_j1_f64(__x); }
|
||||
double j1(double __x) { return __ocml_j1_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double jn(int __n,
|
||||
double __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
// for linear recurrences to get O(log n) steps, but it's unclear if
|
||||
// it'd be beneficial in this case. Placeholder until OCML adds
|
||||
// support.
|
||||
if (__n == 0)
|
||||
return j0f(__x);
|
||||
return j0(__x);
|
||||
if (__n == 1)
|
||||
return j1f(__x);
|
||||
return j1(__x);
|
||||
|
||||
double __x0 = j0f(__x);
|
||||
double __x1 = j1f(__x);
|
||||
double __x0 = j0(__x);
|
||||
double __x1 = j1(__x);
|
||||
for (int __i = 1; __i < __n; ++__i) {
|
||||
double __x2 = (2 * __i) / __x * __x1 - __x0;
|
||||
__x0 = __x1;
|
||||
__x1 = __x2;
|
||||
}
|
||||
|
||||
return __x1;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
|
||||
double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
|
||||
|
||||
__DEVICE__
|
||||
inline double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
|
||||
double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long long int llrint(double __x) { return __ocml_rint_f64(__x); }
|
||||
long long int llrint(double __x) { return __ocml_rint_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long long int llround(double __x) { return __ocml_round_f64(__x); }
|
||||
long long int llround(double __x) { return __ocml_round_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double log(double __x) { return __ocml_log_f64(__x); }
|
||||
double log(double __x) { return __ocml_log_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double log10(double __x) { return __ocml_log10_f64(__x); }
|
||||
double log10(double __x) { return __ocml_log10_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double log1p(double __x) { return __ocml_log1p_f64(__x); }
|
||||
double log1p(double __x) { return __ocml_log1p_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double log2(double __x) { return __ocml_log2_f64(__x); }
|
||||
double log2(double __x) { return __ocml_log2_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double logb(double __x) { return __ocml_logb_f64(__x); }
|
||||
double logb(double __x) { return __ocml_logb_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long int lrint(double __x) { return __ocml_rint_f64(__x); }
|
||||
long int lrint(double __x) { return __ocml_rint_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline long int lround(double __x) { return __ocml_round_f64(__x); }
|
||||
long int lround(double __x) { return __ocml_round_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double modf(double __x, double *__iptr) {
|
||||
double modf(double __x, double *__iptr) {
|
||||
double __tmp;
|
||||
double __r =
|
||||
__ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
|
||||
@@ -699,8 +878,9 @@ inline double modf(double __x, double *__iptr) {
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double nan(const char *__tagp) {
|
||||
double nan(const char *__tagp) {
|
||||
#if !_WIN32
|
||||
union {
|
||||
double val;
|
||||
@@ -710,8 +890,8 @@ inline double nan(const char *__tagp) {
|
||||
uint32_t exponent : 11;
|
||||
uint32_t sign : 1;
|
||||
} bits;
|
||||
static_assert(sizeof(double) == sizeof(ieee_double), "");
|
||||
} __tmp;
|
||||
__static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));
|
||||
|
||||
__tmp.bits.sign = 0u;
|
||||
__tmp.bits.exponent = ~0u;
|
||||
@@ -720,22 +900,24 @@ inline double nan(const char *__tagp) {
|
||||
|
||||
return __tmp.val;
|
||||
#else
|
||||
static_assert(sizeof(uint64_t) == sizeof(double));
|
||||
uint64_t val = __make_mantissa(__tagp);
|
||||
val |= 0xFFF << 51;
|
||||
return *reinterpret_cast<double *>(&val);
|
||||
__static_assert_type_size_equal(sizeof(uint64_t), sizeof(double));
|
||||
uint64_t __val = __make_mantissa(__tagp);
|
||||
__val |= 0xFFF << 51;
|
||||
return *reinterpret_cast<double *>(&__val);
|
||||
#endif
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
|
||||
double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double nextafter(double __x, double __y) {
|
||||
double nextafter(double __x, double __y) {
|
||||
return __ocml_nextafter_f64(__x, __y);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double
|
||||
norm(int __dim,
|
||||
const double *__a) { // TODO: placeholder until OCML adds support.
|
||||
double norm(int __dim,
|
||||
const double *__a) { // TODO: placeholder until OCML adds support.
|
||||
double __r = 0;
|
||||
while (__dim--) {
|
||||
__r += __a[0] * __a[0];
|
||||
@@ -744,28 +926,39 @@ norm(int __dim,
|
||||
|
||||
return __ocml_sqrt_f64(__r);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double norm3d(double __x, double __y, double __z) {
|
||||
double norm3d(double __x, double __y, double __z) {
|
||||
return __ocml_len3_f64(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double norm4d(double __x, double __y, double __z, double __w) {
|
||||
double norm4d(double __x, double __y, double __z, double __w) {
|
||||
return __ocml_len4_f64(__x, __y, __z, __w);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
|
||||
double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
|
||||
double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
|
||||
double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
|
||||
double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double remainder(double __x, double __y) {
|
||||
double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
double remainder(double __x, double __y) {
|
||||
return __ocml_remainder_f64(__x, __y);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double remquo(double __x, double __y, int *__quo) {
|
||||
double remquo(double __x, double __y, int *__quo) {
|
||||
int __tmp;
|
||||
double __r = __ocml_remquo_f64(
|
||||
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
|
||||
@@ -773,16 +966,16 @@ inline double remquo(double __x, double __y, int *__quo) {
|
||||
|
||||
return __r;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double rhypot(double __x, double __y) {
|
||||
return __ocml_rhypot_f64(__x, __y);
|
||||
}
|
||||
double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double rint(double __x) { return __ocml_rint_f64(__x); }
|
||||
double rint(double __x) { return __ocml_rint_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double
|
||||
rnorm(int __dim,
|
||||
const double *__a) { // TODO: placeholder until OCML adds support.
|
||||
double rnorm(int __dim,
|
||||
const double *__a) { // TODO: placeholder until OCML adds support.
|
||||
double __r = 0;
|
||||
while (__dim--) {
|
||||
__r += __a[0] * __a[0];
|
||||
@@ -791,77 +984,93 @@ rnorm(int __dim,
|
||||
|
||||
return __ocml_rsqrt_f64(__r);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double rnorm3d(double __x, double __y, double __z) {
|
||||
double rnorm3d(double __x, double __y, double __z) {
|
||||
return __ocml_rlen3_f64(__x, __y, __z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double rnorm4d(double __x, double __y, double __z, double __w) {
|
||||
double rnorm4d(double __x, double __y, double __z, double __w) {
|
||||
return __ocml_rlen4_f64(__x, __y, __z, __w);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double round(double __x) { return __ocml_round_f64(__x); }
|
||||
double round(double __x) { return __ocml_round_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
|
||||
double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double scalbln(double __x, long int __n) {
|
||||
double scalbln(double __x, long int __n) {
|
||||
return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n)
|
||||
: __ocml_scalb_f64(__x, __n);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double scalbn(double __x, int __n) {
|
||||
return __ocml_scalbn_f64(__x, __n);
|
||||
}
|
||||
double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); }
|
||||
|
||||
__DEVICE__
|
||||
inline __RETURN_TYPE signbit(double __x) { return __ocml_signbit_f64(__x); }
|
||||
__RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double sin(double __x) { return __ocml_sin_f64(__x); }
|
||||
double sin(double __x) { return __ocml_sin_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline void sincos(double __x, double *__sinptr, double *__cosptr) {
|
||||
void sincos(double __x, double *__sinptr, double *__cosptr) {
|
||||
double __tmp;
|
||||
*__sinptr = __ocml_sincos_f64(
|
||||
__x, (__attribute__((address_space(5))) double *)&__tmp);
|
||||
*__cosptr = __tmp;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline void sincospi(double __x, double *__sinptr, double *__cosptr) {
|
||||
void sincospi(double __x, double *__sinptr, double *__cosptr) {
|
||||
double __tmp;
|
||||
*__sinptr = __ocml_sincospi_f64(
|
||||
__x, (__attribute__((address_space(5))) double *)&__tmp);
|
||||
*__cosptr = __tmp;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline double sinh(double __x) { return __ocml_sinh_f64(__x); }
|
||||
double sinh(double __x) { return __ocml_sinh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
|
||||
double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
|
||||
double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double tan(double __x) { return __ocml_tan_f64(__x); }
|
||||
double tan(double __x) { return __ocml_tan_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double tanh(double __x) { return __ocml_tanh_f64(__x); }
|
||||
double tanh(double __x) { return __ocml_tanh_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
|
||||
double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double trunc(double __x) { return __ocml_trunc_f64(__x); }
|
||||
double trunc(double __x) { return __ocml_trunc_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double y0(double __x) { return __ocml_y0_f64(__x); }
|
||||
double y0(double __x) { return __ocml_y0_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double y1(double __x) { return __ocml_y1_f64(__x); }
|
||||
double y1(double __x) { return __ocml_y1_f64(__x); }
|
||||
|
||||
__DEVICE__
|
||||
inline double yn(int __n,
|
||||
double __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication
|
||||
// and the Miller & Brown algorithm
|
||||
// for linear recurrences to get O(log n) steps, but it's unclear if
|
||||
// it'd be beneficial in this case. Placeholder until OCML adds
|
||||
// support.
|
||||
if (__n == 0)
|
||||
return j0f(__x);
|
||||
return y0(__x);
|
||||
if (__n == 1)
|
||||
return j1f(__x);
|
||||
return y1(__x);
|
||||
|
||||
double __x0 = j0f(__x);
|
||||
double __x1 = j1f(__x);
|
||||
double __x0 = y0(__x);
|
||||
double __x1 = y1(__x);
|
||||
for (int __i = 1; __i < __n; ++__i) {
|
||||
double __x2 = (2 * __i) / __x * __x1 - __x0;
|
||||
__x0 = __x1;
|
||||
@@ -874,296 +1083,182 @@ inline double yn(int __n,
|
||||
// BEGIN INTRINSICS
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __dadd_rd(double __x, double __y) {
|
||||
double __dadd_rd(double __x, double __y) {
|
||||
return __ocml_add_rtn_f64(__x, __y);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __dadd_rn(double __x, double __y) { return __x + __y; }
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
double __dadd_rn(double __x, double __y) {
|
||||
return __ocml_add_rte_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __dadd_ru(double __x, double __y) {
|
||||
double __dadd_ru(double __x, double __y) {
|
||||
return __ocml_add_rtp_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __dadd_rz(double __x, double __y) {
|
||||
double __dadd_rz(double __x, double __y) {
|
||||
return __ocml_add_rtz_f64(__x, __y);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
inline double __ddiv_rd(double __x, double __y) {
|
||||
return __ocml_div_rtn_f64(__x, __y);
|
||||
}
|
||||
double __dadd_rn(double __x, double __y) { return __x + __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __ddiv_rn(double __x, double __y) { return __x / __y; }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __ddiv_ru(double __x, double __y) {
|
||||
double __ddiv_rd(double __x, double __y) {
|
||||
return __ocml_div_rtn_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
double __ddiv_rn(double __x, double __y) {
|
||||
return __ocml_div_rte_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
double __ddiv_ru(double __x, double __y) {
|
||||
return __ocml_div_rtp_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __ddiv_rz(double __x, double __y) {
|
||||
double __ddiv_rz(double __x, double __y) {
|
||||
return __ocml_div_rtz_f64(__x, __y);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
inline double __dmul_rd(double __x, double __y) {
|
||||
return __ocml_mul_rtn_f64(__x, __y);
|
||||
}
|
||||
double __ddiv_rn(double __x, double __y) { return __x / __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __dmul_rn(double __x, double __y) { return __x * __y; }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __dmul_ru(double __x, double __y) {
|
||||
double __dmul_rd(double __x, double __y) {
|
||||
return __ocml_mul_rtn_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
double __dmul_rn(double __x, double __y) {
|
||||
return __ocml_mul_rte_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
double __dmul_ru(double __x, double __y) {
|
||||
return __ocml_mul_rtp_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __dmul_rz(double __x, double __y) {
|
||||
double __dmul_rz(double __x, double __y) {
|
||||
return __ocml_mul_rtz_f64(__x, __y);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
inline double __drcp_rd(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
|
||||
double __dmul_rn(double __x, double __y) { return __x * __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __drcp_rn(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __drcp_ru(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
|
||||
double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); }
|
||||
__DEVICE__
|
||||
inline double __drcp_rz(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
|
||||
double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); }
|
||||
__DEVICE__
|
||||
inline double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }
|
||||
double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); }
|
||||
__DEVICE__
|
||||
double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); }
|
||||
#else
|
||||
__DEVICE__
|
||||
double __drcp_rn(double __x) { return 1.0 / __x; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }
|
||||
double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }
|
||||
__DEVICE__
|
||||
inline double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
|
||||
double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); }
|
||||
__DEVICE__
|
||||
inline double __dsub_rd(double __x, double __y) {
|
||||
double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }
|
||||
__DEVICE__
|
||||
double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
|
||||
#else
|
||||
__DEVICE__
|
||||
double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
|
||||
#endif
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
double __dsub_rd(double __x, double __y) {
|
||||
return __ocml_sub_rtn_f64(__x, __y);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __dsub_rn(double __x, double __y) { return __x - __y; }
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
double __dsub_rn(double __x, double __y) {
|
||||
return __ocml_sub_rte_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __dsub_ru(double __x, double __y) {
|
||||
double __dsub_ru(double __x, double __y) {
|
||||
return __ocml_sub_rtp_f64(__x, __y);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __dsub_rz(double __x, double __y) {
|
||||
double __dsub_rz(double __x, double __y) {
|
||||
return __ocml_sub_rtz_f64(__x, __y);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
inline double __fma_rd(double __x, double __y, double __z) {
|
||||
return __ocml_fma_rtn_f64(__x, __y, __z);
|
||||
}
|
||||
double __dsub_rn(double __x, double __y) { return __x - __y; }
|
||||
#endif
|
||||
__DEVICE__
|
||||
inline double __fma_rn(double __x, double __y, double __z) {
|
||||
return __ocml_fma_f64(__x, __y, __z);
|
||||
}
|
||||
|
||||
#if defined OCML_BASIC_ROUNDED_OPERATIONS
|
||||
__DEVICE__
|
||||
inline double __fma_ru(double __x, double __y, double __z) {
|
||||
double __fma_rd(double __x, double __y, double __z) {
|
||||
return __ocml_fma_rtn_f64(__x, __y, __z);
|
||||
}
|
||||
__DEVICE__
|
||||
double __fma_rn(double __x, double __y, double __z) {
|
||||
return __ocml_fma_rte_f64(__x, __y, __z);
|
||||
}
|
||||
__DEVICE__
|
||||
double __fma_ru(double __x, double __y, double __z) {
|
||||
return __ocml_fma_rtp_f64(__x, __y, __z);
|
||||
}
|
||||
__DEVICE__
|
||||
inline double __fma_rz(double __x, double __y, double __z) {
|
||||
double __fma_rz(double __x, double __y, double __z) {
|
||||
return __ocml_fma_rtz_f64(__x, __y, __z);
|
||||
}
|
||||
#else
|
||||
__DEVICE__
|
||||
double __fma_rn(double __x, double __y, double __z) {
|
||||
return __ocml_fma_f64(__x, __y, __z);
|
||||
}
|
||||
#endif
|
||||
// END INTRINSICS
|
||||
// END DOUBLE
|
||||
|
||||
// BEGIN INTEGER
|
||||
// Returns the absolute value of __x without branching.
__DEVICE__
inline int abs(int __x) {
  // Shifting by (bit width - 1) produces a mask that is all ones for a
  // negative input and zero otherwise (assumes >> on a negative signed value
  // is an arithmetic shift, as it is with clang).
  const int __mask = __x >> (sizeof(int) * CHAR_BIT - 1);
  // XOR with the mask flips every bit of a negative value; subtracting the
  // mask (-1) then adds one, which completes the two's-complement negation.
  // A non-negative value passes through unchanged because the mask is zero.
  return (__x ^ __mask) - __mask;
}
|
||||
// Returns the absolute value of a long without branching.
__DEVICE__
inline long labs(long __x) {
  // All ones when __x is negative, zero otherwise (assumes an arithmetic
  // right shift for negative signed values).
  const long __mask = __x >> (sizeof(long) * CHAR_BIT - 1);
  // Conditional two's-complement negation: flip the bits, then add one.
  return (__x ^ __mask) - __mask;
}
|
||||
// Returns the absolute value of a long long without branching.
__DEVICE__
inline long long llabs(long long __x) {
  // All ones when __x is negative, zero otherwise (assumes an arithmetic
  // right shift for negative signed values).
  const long long __mask = __x >> (sizeof(long long) * CHAR_BIT - 1);
  // Conditional two's-complement negation: flip the bits, then add one.
  return (__x ^ __mask) - __mask;
}
|
||||
// C only macros
|
||||
#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L
// Type-generic classification macros for C11 and later. _Generic picks the
// float or the double helper from the static type of the argument and then
// calls it. Only float and double associations are listed, so an argument of
// any other type does not compile.
#define isfinite(__x)                                                          \
  _Generic((__x), float : __finitef, double : __finite)(__x)
#define isinf(__x)                                                             \
  _Generic((__x), float : __isinff, double : __isinf)(__x)
#define isnan(__x)                                                             \
  _Generic((__x), float : __isnanf, double : __isnan)(__x)
#define signbit(__x)                                                           \
  _Generic((__x), float : __signbitf, double : __signbit)(__x)
#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L
|
||||
|
||||
#if defined(__cplusplus)
|
||||
__DEVICE__
|
||||
inline long abs(long __x) { return labs(__x); }
|
||||
__DEVICE__
|
||||
inline long long abs(long long __x) { return llabs(__x); }
|
||||
#endif
|
||||
// END INTEGER
|
||||
|
||||
// Half-precision overload of fma; forwards all three operands to
// __ocml_fma_f16 and returns its result.
__DEVICE__
inline _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
  const _Float16 __result = __ocml_fma_f16(__x, __y, __z);
  return __result;
}
|
||||
|
||||
// Single-precision overload of fma; forwards all three operands to fmaf and
// returns its result.
__DEVICE__
inline float fma(float __x, float __y, float __z) {
  const float __result = fmaf(__x, __y, __z);
  return __result;
}
|
||||
|
||||
// Save any existing definitions of the helper macro names used below so that
// they can be restored with the matching pop_macro pragmas afterwards.
#pragma push_macro("__DEF_FUN1")
#pragma push_macro("__DEF_FUN2")
#pragma push_macro("__DEF_FUNI")
#pragma push_macro("__DEF_FLOAT_FUN2I")
#pragma push_macro("__HIP_OVERLOAD1")
#pragma push_macro("__HIP_OVERLOAD2")
|
||||
|
||||
// __hip_enable_if<__B, __T>::type names __T when __B is true. When __B is
// false the primary template is selected, which has no `type` member, so any
// declaration that spells `typename __hip_enable_if<false, __T>::type` is
// ill-formed at that point.
template <bool __B, class __T = void>
struct __hip_enable_if {};

// Partial specialization for __B == true: exposes __T as `type`.
template <class __T>
struct __hip_enable_if<true, __T> {
  typedef __T type;
};
|
||||
|
||||
// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
|
||||
// avoid compilation error due to ambiguity. e.g. floor(5) is resolved with
|
||||
// floor(double).
|
||||
// Expands to a function template named __fn that accepts one argument of any
// type for which std::numeric_limits<__T>::is_integer is true. The argument
// is converted to double and forwarded to the global-namespace ::__fn; the
// declared return type is __retty.
#define __HIP_OVERLOAD1(__retty, __fn)                                         \
  template <typename __T>                                                      \
  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
                                      __retty>::type                           \
  __fn(__T __x) {                                                              \
    return ::__fn(static_cast<double>(__x));                                   \
  }
|
||||
|
||||
// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
|
||||
// or integer argument to avoid compilation error due to ambiguity. e.g.
|
||||
// max(5.0f, 6.0) is resolved with max(double, double).
|
||||
// Expands to a two-argument function template named __fn that participates
// only when std::numeric_limits is specialized for both argument types. Both
// arguments are converted to double and forwarded to an unqualified call of
// __fn; the declared return type is __retty.
#define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
  __DEVICE__ typename __hip_enable_if<                                         \
      std::numeric_limits<__T1>::is_specialized &&                             \
          std::numeric_limits<__T2>::is_specialized,                           \
      __retty>::type                                                           \
  __fn(__T1 __x, __T2 __y) {                                                   \
    return __fn(static_cast<double>(__x), static_cast<double>(__y));           \
  }
|
||||
|
||||
// __DEF_FUN1(__retty, __func) defines two things for a one-argument cmath
// function:
//   * a float overload of __func that forwards to the f-suffixed function
//     (__func##f) and returns float;
//   * the integer-argument template overload produced by __HIP_OVERLOAD1,
//     whose return type is __retty.
#define __DEF_FUN1(__retty, __func)                                            \
  __DEVICE__                                                                   \
  inline float __func(float __x) {                                             \
    return __func##f(__x);                                                     \
  }                                                                            \
  __HIP_OVERLOAD1(__retty, __func)
|
||||
|
||||
// __DEF_FUNI(__retty, __func) is the variant of __DEF_FUN1 for functions whose
// result is not float: the float overload forwards to __func##f and returns
// __retty, and __HIP_OVERLOAD1 adds the integer-argument template overload
// with the same return type.
#define __DEF_FUNI(__retty, __func)                                            \
  __DEVICE__                                                                   \
  inline __retty __func(float __x) {                                           \
    return __func##f(__x);                                                     \
  }                                                                            \
  __HIP_OVERLOAD1(__retty, __func)
|
||||
|
||||
// __DEF_FUN2(__retty, __func) defines two things for a two-argument cmath
// function:
//   * a (float, float) overload of __func that forwards to __func##f and
//     returns float;
//   * the mixed-argument template overload produced by __HIP_OVERLOAD2, whose
//     return type is __retty.
#define __DEF_FUN2(__retty, __func)                                            \
  __DEVICE__                                                                   \
  inline float __func(float __x, float __y) {                                  \
    return __func##f(__x, __y);                                                \
  }                                                                            \
  __HIP_OVERLOAD2(__retty, __func)
|
||||
|
||||
__DEF_FUN1(double, acos)
|
||||
__DEF_FUN1(double, acosh)
|
||||
__DEF_FUN1(double, asin)
|
||||
__DEF_FUN1(double, asinh)
|
||||
__DEF_FUN1(double, atan)
|
||||
__DEF_FUN2(double, atan2);
|
||||
__DEF_FUN1(double, atanh)
|
||||
__DEF_FUN1(double, cbrt)
|
||||
__DEF_FUN1(double, ceil)
|
||||
__DEF_FUN2(double, copysign);
|
||||
__DEF_FUN1(double, cos)
|
||||
__DEF_FUN1(double, cosh)
|
||||
__DEF_FUN1(double, erf)
|
||||
__DEF_FUN1(double, erfc)
|
||||
__DEF_FUN1(double, exp)
|
||||
__DEF_FUN1(double, exp2)
|
||||
__DEF_FUN1(double, expm1)
|
||||
__DEF_FUN1(double, fabs)
|
||||
__DEF_FUN2(double, fdim);
|
||||
__DEF_FUN1(double, floor)
|
||||
__DEF_FUN2(double, fmax);
|
||||
__DEF_FUN2(double, fmin);
|
||||
__DEF_FUN2(double, fmod);
|
||||
//__HIP_OVERLOAD1(int, fpclassify)
|
||||
__DEF_FUN2(double, hypot);
|
||||
__DEF_FUNI(int, ilogb)
|
||||
__HIP_OVERLOAD1(bool, isfinite)
|
||||
__HIP_OVERLOAD2(bool, isgreater);
|
||||
__HIP_OVERLOAD2(bool, isgreaterequal);
|
||||
__HIP_OVERLOAD1(bool, isinf);
|
||||
__HIP_OVERLOAD2(bool, isless);
|
||||
__HIP_OVERLOAD2(bool, islessequal);
|
||||
__HIP_OVERLOAD2(bool, islessgreater);
|
||||
__HIP_OVERLOAD1(bool, isnan);
|
||||
//__HIP_OVERLOAD1(bool, isnormal)
|
||||
__HIP_OVERLOAD2(bool, isunordered);
|
||||
__DEF_FUN1(double, lgamma)
|
||||
__DEF_FUN1(double, log)
|
||||
__DEF_FUN1(double, log10)
|
||||
__DEF_FUN1(double, log1p)
|
||||
__DEF_FUN1(double, log2)
|
||||
__DEF_FUN1(double, logb)
|
||||
__DEF_FUNI(long long, llrint)
|
||||
__DEF_FUNI(long long, llround)
|
||||
__DEF_FUNI(long, lrint)
|
||||
__DEF_FUNI(long, lround)
|
||||
__DEF_FUN1(double, nearbyint);
|
||||
__DEF_FUN2(double, nextafter);
|
||||
__DEF_FUN2(double, pow);
|
||||
__DEF_FUN2(double, remainder);
|
||||
__DEF_FUN1(double, rint);
|
||||
__DEF_FUN1(double, round);
|
||||
__HIP_OVERLOAD1(bool, signbit)
|
||||
__DEF_FUN1(double, sin)
|
||||
__DEF_FUN1(double, sinh)
|
||||
__DEF_FUN1(double, sqrt)
|
||||
__DEF_FUN1(double, tan)
|
||||
__DEF_FUN1(double, tanh)
|
||||
__DEF_FUN1(double, tgamma)
|
||||
__DEF_FUN1(double, trunc);
|
||||
|
||||
// define cmath functions with a float and an integer argument.
|
||||
#define __DEF_FLOAT_FUN2I(__func) \
|
||||
__DEVICE__ \
|
||||
inline float __func(float __x, int __y) { return __func##f(__x, __y); }
|
||||
__DEF_FLOAT_FUN2I(scalbn)
|
||||
|
||||
template <class T> __DEVICE__ inline T min(T __arg1, T __arg2) {
|
||||
template <class T> __DEVICE__ T min(T __arg1, T __arg2) {
|
||||
return (__arg1 < __arg2) ? __arg1 : __arg2;
|
||||
}
|
||||
|
||||
template <class T> __DEVICE__ inline T max(T __arg1, T __arg2) {
|
||||
template <class T> __DEVICE__ T max(T __arg1, T __arg2) {
|
||||
return (__arg1 > __arg2) ? __arg1 : __arg2;
|
||||
}
|
||||
|
||||
__DEVICE__ inline int min(int __arg1, int __arg2) {
|
||||
__DEVICE__ int min(int __arg1, int __arg2) {
|
||||
return (__arg1 < __arg2) ? __arg1 : __arg2;
|
||||
}
|
||||
__DEVICE__ inline int max(int __arg1, int __arg2) {
|
||||
__DEVICE__ int max(int __arg1, int __arg2) {
|
||||
return (__arg1 > __arg2) ? __arg1 : __arg2;
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float max(float __x, float __y) { return fmaxf(__x, __y); }
|
||||
float max(float __x, float __y) { return fmaxf(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double max(double __x, double __y) { return fmax(__x, __y); }
|
||||
double max(double __x, double __y) { return fmax(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline float min(float __x, float __y) { return fminf(__x, __y); }
|
||||
float min(float __x, float __y) { return fminf(__x, __y); }
|
||||
|
||||
__DEVICE__
|
||||
inline double min(double __x, double __y) { return fmin(__x, __y); }
|
||||
|
||||
__HIP_OVERLOAD2(double, max)
|
||||
__HIP_OVERLOAD2(double, min)
|
||||
double min(double __x, double __y) { return fmin(__x, __y); }
|
||||
|
||||
__host__ inline static int min(int __arg1, int __arg2) {
|
||||
return std::min(__arg1, __arg2);
|
||||
@@ -1172,13 +1267,8 @@ __host__ inline static int min(int __arg1, int __arg2) {
|
||||
__host__ inline static int max(int __arg1, int __arg2) {
|
||||
return std::max(__arg1, __arg2);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Restore the macro definitions that were saved with push_macro, so the helper
// macros defined in this header do not leak to code that includes it.
// NOTE(review): the pushes for __DEVICE__ and __RETURN_TYPE are presumably
// earlier in the file; confirm they pair up with the pops here.
#pragma pop_macro("__DEF_FUN1")
#pragma pop_macro("__DEF_FUN2")
#pragma pop_macro("__DEF_FUNI")
#pragma pop_macro("__DEF_FLOAT_FUN2I")
#pragma pop_macro("__HIP_OVERLOAD1")
#pragma pop_macro("__HIP_OVERLOAD2")
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__RETURN_TYPE")
|
||||
|
||||
|
||||
+5
@@ -28,6 +28,10 @@
|
||||
#define __shared__ __attribute__((shared))
|
||||
#define __constant__ __attribute__((constant))
|
||||
|
||||
// Provide nullptr for C and for C++ older than C++11, where the keyword does
// not exist. The replacement text must not end in a semicolon: the macro is
// used inside expressions (e.g. `if (p == nullptr)`), and an embedded `;`
// would turn those into syntax errors.
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL
#endif
|
||||
|
||||
#if __HIP_ENABLE_DEVICE_MALLOC__
|
||||
extern "C" __device__ void *__hip_malloc(size_t __size);
|
||||
extern "C" __device__ void *__hip_free(void *__ptr);
|
||||
@@ -51,6 +55,7 @@ static inline __device__ void *free(void *__ptr) {
|
||||
|
||||
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
#include <__clang_hip_cmath.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
Vendored
+1100
-69
@@ -1709,6 +1709,20 @@ vec_cmpeq(vector double __a, vector double __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element equality, available when __POWER10_VECTOR__ is defined.
   Both operands are reinterpreted as vector bool __int128 and handed to the
   vcmpequq builtin; the result is returned as vector bool __int128. */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpeq(vector signed __int128 __a, vector signed __int128 __b) {
  vector bool __int128 __lhs = (vector bool __int128)__a;
  vector bool __int128 __rhs = (vector bool __int128)__b;
  return (vector bool __int128)__builtin_altivec_vcmpequq(__lhs, __rhs);
}

/* Unsigned counterpart: same builtin, same reinterpretation of operands. */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector bool __int128 __lhs = (vector bool __int128)__a;
  vector bool __int128 __rhs = (vector bool __int128)__b;
  return (vector bool __int128)__builtin_altivec_vcmpequq(__lhs, __rhs);
}
#endif
|
||||
|
||||
#ifdef __POWER9_VECTOR__
|
||||
/* vec_cmpne */
|
||||
|
||||
@@ -1766,36 +1780,26 @@ vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
|
||||
(vector int)__b);
|
||||
}
|
||||
|
||||
static __inline__ vector bool long long __ATTRS_o_ai
|
||||
vec_cmpne(vector bool long long __a, vector bool long long __b) {
|
||||
return (vector bool long long)
|
||||
~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool long long __ATTRS_o_ai
|
||||
vec_cmpne(vector signed long long __a, vector signed long long __b) {
|
||||
return (vector bool long long)
|
||||
~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool long long __ATTRS_o_ai
|
||||
vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return (vector bool long long)
|
||||
~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool int __ATTRS_o_ai
|
||||
vec_cmpne(vector float __a, vector float __b) {
|
||||
return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
|
||||
(vector int)__b);
|
||||
}
|
||||
|
||||
static __inline__ vector bool long long __ATTRS_o_ai
|
||||
vec_cmpne(vector double __a, vector double __b) {
|
||||
return (vector bool long long)
|
||||
~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
|
||||
#ifdef __POWER10_VECTOR__
|
||||
static __inline__ vector bool __int128 __ATTRS_o_ai
|
||||
vec_cmpne(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
||||
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
|
||||
(vector bool __int128)__a, (vector bool __int128)__b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool __int128 __ATTRS_o_ai
|
||||
vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) {
|
||||
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
|
||||
(vector bool __int128)__a, (vector bool __int128)__b));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* vec_cmpnez */
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
@@ -1900,6 +1904,86 @@ vec_parity_lsbb(vector signed long long __a) {
|
||||
return __builtin_altivec_vprtybd(__a);
|
||||
}
|
||||
|
||||
#else
|
||||
/* vec_cmpne */
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
vec_cmpne(vector bool char __a, vector bool char __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
vec_cmpne(vector signed char __a, vector signed char __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool short __ATTRS_o_ai
|
||||
vec_cmpne(vector bool short __a, vector bool short __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool short __ATTRS_o_ai
|
||||
vec_cmpne(vector signed short __a, vector signed short __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool short __ATTRS_o_ai
|
||||
vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool int __ATTRS_o_ai
|
||||
vec_cmpne(vector bool int __a, vector bool int __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool int __ATTRS_o_ai
|
||||
vec_cmpne(vector signed int __a, vector signed int __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool int __ATTRS_o_ai
|
||||
vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
|
||||
static __inline__ vector bool int __ATTRS_o_ai
|
||||
vec_cmpne(vector float __a, vector float __b) {
|
||||
return ~(vec_cmpeq(__a, __b));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER8_VECTOR__
/* 64-bit element inequality: operands are reinterpreted as vector long long,
   compared with the vcmpequd builtin, and the result is complemented. */
static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpne(vector bool long long __a, vector bool long long __b) {
  vector long long __lhs = (vector long long)__a;
  vector long long __rhs = (vector long long)__b;
  return (vector bool long long)~__builtin_altivec_vcmpequd(__lhs, __rhs);
}

static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpne(vector signed long long __a, vector signed long long __b) {
  vector long long __lhs = (vector long long)__a;
  vector long long __rhs = (vector long long)__b;
  return (vector bool long long)~__builtin_altivec_vcmpequd(__lhs, __rhs);
}

static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
  vector long long __lhs = (vector long long)__a;
  vector long long __rhs = (vector long long)__b;
  return (vector bool long long)~__builtin_altivec_vcmpequd(__lhs, __rhs);
}
#endif
|
||||
|
||||
#ifdef __VSX__
/* Double-precision inequality. The operands are cast to vector long long
   before the vcmpequd builtin is applied, and the result is complemented.
   NOTE(review): because of the casts this presumably compares raw bit
   patterns rather than floating-point values (relevant for NaN and for
   +0.0 versus -0.0) - confirm this is intended. */
static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpne(vector double __a, vector double __b) {
  vector long long __lhs = (vector long long)__a;
  vector long long __rhs = (vector long long)__b;
  return (vector bool long long)~__builtin_altivec_vcmpequd(__lhs, __rhs);
}
#endif
|
||||
|
||||
/* vec_cmpgt */
|
||||
@@ -1962,6 +2046,20 @@ vec_cmpgt(vector double __a, vector double __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element greater-than, available when __POWER10_VECTOR__ is
   defined. The signed overload uses the vcmpgtsq builtin and the unsigned
   overload uses vcmpgtuq; in both cases the operands are first reinterpreted
   as vector bool __int128. */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpgt(vector signed __int128 __a, vector signed __int128 __b) {
  vector bool __int128 __lhs = (vector bool __int128)__a;
  vector bool __int128 __rhs = (vector bool __int128)__b;
  return (vector bool __int128)__builtin_altivec_vcmpgtsq(__lhs, __rhs);
}

static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpgt(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector bool __int128 __lhs = (vector bool __int128)__a;
  vector bool __int128 __rhs = (vector bool __int128)__b;
  return (vector bool __int128)__builtin_altivec_vcmpgtuq(__lhs, __rhs);
}
#endif
|
||||
|
||||
/* vec_cmpge */
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
@@ -2022,6 +2120,18 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element greater-or-equal, expressed as the complement of
   "__b greater than __a". */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpge(vector signed __int128 __a, vector signed __int128 __b) {
  return ~vec_cmpgt(__b, __a);
}

static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpge(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  return ~vec_cmpgt(__b, __a);
}
#endif
|
||||
|
||||
/* vec_vcmpgefp */
|
||||
|
||||
static __inline__ vector bool int __attribute__((__always_inline__))
|
||||
@@ -2134,6 +2244,18 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element less-or-equal: vec_cmpge with the operands swapped. */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmple(vector signed __int128 __a, vector signed __int128 __b) {
  vector bool __int128 __res = vec_cmpge(__b, __a);
  return __res;
}

static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmple(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector bool __int128 __res = vec_cmpge(__b, __a);
  return __res;
}
#endif
|
||||
|
||||
/* vec_cmplt */
|
||||
|
||||
static __inline__ vector bool char __ATTRS_o_ai
|
||||
@@ -2178,6 +2300,18 @@ vec_cmplt(vector double __a, vector double __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element less-than: vec_cmpgt with the operands swapped. */
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmplt(vector signed __int128 __a, vector signed __int128 __b) {
  vector bool __int128 __res = vec_cmpgt(__b, __a);
  return __res;
}

static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmplt(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector bool __int128 __res = vec_cmpgt(__b, __a);
  return __res;
}
#endif
|
||||
|
||||
#ifdef __POWER8_VECTOR__
|
||||
static __inline__ vector bool long long __ATTRS_o_ai
|
||||
vec_cmplt(vector signed long long __a, vector signed long long __b) {
|
||||
@@ -2702,67 +2836,67 @@ vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
|
||||
}
|
||||
|
||||
#if defined(__powerpc64__)
|
||||
static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
|
||||
static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(const signed char *__a,
|
||||
size_t __b) {
|
||||
return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_xl_len(unsigned char *__a, size_t __b) {
|
||||
vec_xl_len(const unsigned char *__a, size_t __b) {
|
||||
return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
|
||||
static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(const signed short *__a,
|
||||
size_t __b) {
|
||||
return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_xl_len(unsigned short *__a, size_t __b) {
|
||||
vec_xl_len(const unsigned short *__a, size_t __b) {
|
||||
return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
|
||||
static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(const signed int *__a,
|
||||
size_t __b) {
|
||||
return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(const unsigned int *__a,
|
||||
size_t __b) {
|
||||
return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
|
||||
static __inline__ vector float __ATTRS_o_ai vec_xl_len(const float *__a, size_t __b) {
|
||||
return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_xl_len(signed __int128 *__a, size_t __b) {
|
||||
vec_xl_len(const signed __int128 *__a, size_t __b) {
|
||||
return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_len(unsigned __int128 *__a, size_t __b) {
|
||||
vec_xl_len(const unsigned __int128 *__a, size_t __b) {
|
||||
return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_xl_len(signed long long *__a, size_t __b) {
|
||||
vec_xl_len(const signed long long *__a, size_t __b) {
|
||||
return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_xl_len(unsigned long long *__a, size_t __b) {
|
||||
vec_xl_len(const unsigned long long *__a, size_t __b) {
|
||||
return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
|
||||
static __inline__ vector double __ATTRS_o_ai vec_xl_len(const double *__a,
|
||||
size_t __b) {
|
||||
return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_xl_len_r(unsigned char *__a, size_t __b) {
|
||||
vec_xl_len_r(const unsigned char *__a, size_t __b) {
|
||||
vector unsigned char __res =
|
||||
(vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
@@ -2862,12 +2996,12 @@ static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
|
||||
#ifdef __VSX__
|
||||
static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a,
|
||||
vector float __b) {
|
||||
return __builtin_vsx_xvcpsgnsp(__a, __b);
|
||||
return __builtin_vsx_xvcpsgnsp(__b, __a);
|
||||
}
|
||||
|
||||
static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
|
||||
vector double __b) {
|
||||
return __builtin_vsx_xvcpsgndp(__a, __b);
|
||||
return __builtin_vsx_xvcpsgndp(__b, __a);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2951,6 +3085,42 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
|
||||
|
||||
#define vec_vctuxs __builtin_altivec_vctuxs
|
||||
|
||||
/* vec_signext */
|
||||
|
||||
#ifdef __POWER9_VECTOR__
/* vec_signexti: produce a vector signed int from narrower signed elements.
   Each overload forwards to the vexts*2w builtin for its source width. */
static __inline__ vector signed int __ATTRS_o_ai
vec_signexti(vector signed char __a) {
  vector signed int __res = __builtin_altivec_vextsb2w(__a);
  return __res;
}

static __inline__ vector signed int __ATTRS_o_ai
vec_signexti(vector signed short __a) {
  vector signed int __res = __builtin_altivec_vextsh2w(__a);
  return __res;
}

/* vec_signextll: produce a vector signed long long from narrower signed
   elements. Each overload forwards to the vexts*2d builtin for its source
   width. */
static __inline__ vector signed long long __ATTRS_o_ai
vec_signextll(vector signed char __a) {
  vector signed long long __res = __builtin_altivec_vextsb2d(__a);
  return __res;
}

static __inline__ vector signed long long __ATTRS_o_ai
vec_signextll(vector signed short __a) {
  vector signed long long __res = __builtin_altivec_vextsh2d(__a);
  return __res;
}

static __inline__ vector signed long long __ATTRS_o_ai
vec_signextll(vector signed int __a) {
  vector signed long long __res = __builtin_altivec_vextsw2d(__a);
  return __res;
}
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_signextq: produce a vector signed __int128 from a vector signed long
   long by forwarding to the vextsd2q builtin. */
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_signextq(vector signed long long __a) {
  vector signed __int128 __res = __builtin_altivec_vextsd2q(__a);
  return __res;
}
#endif
|
||||
|
||||
/* vec_signed */
|
||||
|
||||
static __inline__ vector signed int __ATTRS_o_ai
|
||||
@@ -3288,6 +3458,66 @@ static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
/* vec_dive */
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_dive overloads, available when __POWER10_VECTOR__ is defined. Each one
   forwards its two operands, in order, to the vdive* builtin that matches the
   element width and signedness. */

/* 32-bit elements. */
static __inline__ vector signed int __ATTRS_o_ai
vec_dive(vector signed int __a, vector signed int __b) {
  vector signed int __res = __builtin_altivec_vdivesw(__a, __b);
  return __res;
}

static __inline__ vector unsigned int __ATTRS_o_ai
vec_dive(vector unsigned int __a, vector unsigned int __b) {
  vector unsigned int __res = __builtin_altivec_vdiveuw(__a, __b);
  return __res;
}

/* 64-bit elements. */
static __inline__ vector signed long long __ATTRS_o_ai
vec_dive(vector signed long long __a, vector signed long long __b) {
  vector signed long long __res = __builtin_altivec_vdivesd(__a, __b);
  return __res;
}

static __inline__ vector unsigned long long __ATTRS_o_ai
vec_dive(vector unsigned long long __a, vector unsigned long long __b) {
  vector unsigned long long __res = __builtin_altivec_vdiveud(__a, __b);
  return __res;
}

/* 128-bit elements. */
static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_dive(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector unsigned __int128 __res = __builtin_altivec_vdiveuq(__a, __b);
  return __res;
}

static __inline__ vector signed __int128 __ATTRS_o_ai
vec_dive(vector signed __int128 __a, vector signed __int128 __b) {
  vector signed __int128 __res = __builtin_altivec_vdivesq(__a, __b);
  return __res;
}
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element division, written with the vector `/` operator. */
static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_div(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  vector unsigned __int128 __quot = __a / __b;
  return __quot;
}

static __inline__ vector signed __int128 __ATTRS_o_ai
vec_div(vector signed __int128 __a, vector signed __int128 __b) {
  vector signed __int128 __quot = __a / __b;
  return __quot;
}
#endif /* __POWER10_VECTOR__ */
|
||||
|
||||
/* vec_xvtdiv */
|
||||
|
||||
#ifdef __VSX__
/* vec_test_swdiv: forwards two vector double operands to the xvtdivdp
   builtin and returns its int result. */
static __inline__ int __ATTRS_o_ai vec_test_swdiv(vector double __a,
                                                  vector double __b) {
  int __res = __builtin_vsx_xvtdivdp(__a, __b);
  return __res;
}

/* vec_test_swdivs: single-precision counterpart using the xvtdivsp builtin. */
static __inline__ int __ATTRS_o_ai vec_test_swdivs(vector float __a,
                                                   vector float __b) {
  int __res = __builtin_vsx_xvtdivsp(__a, __b);
  return __res;
}
#endif
|
||||
|
||||
/* vec_dss */
|
||||
|
||||
#define vec_dss __builtin_altivec_dss
|
||||
@@ -3300,23 +3530,19 @@ static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) {
|
||||
|
||||
/* vec_dst */
|
||||
#define vec_dst(__PTR, __CW, __STR) \
|
||||
__extension__( \
|
||||
{ __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); })
|
||||
__builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR))
|
||||
|
||||
/* vec_dstst */
|
||||
#define vec_dstst(__PTR, __CW, __STR) \
|
||||
__extension__( \
|
||||
{ __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); })
|
||||
__builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR))
|
||||
|
||||
/* vec_dststt */
|
||||
#define vec_dststt(__PTR, __CW, __STR) \
|
||||
__extension__( \
|
||||
{ __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); })
|
||||
__builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR))
|
||||
|
||||
/* vec_dstt */
|
||||
#define vec_dstt(__PTR, __CW, __STR) \
|
||||
__extension__( \
|
||||
{ __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); })
|
||||
__builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR))
|
||||
|
||||
/* vec_eqv */
|
||||
|
||||
@@ -5467,6 +5693,16 @@ vec_msum(vector unsigned short __a, vector unsigned short __b,
|
||||
return __builtin_altivec_vmsumuhm(__a, __b, __c);
|
||||
}
|
||||
|
||||
/* vec_msumc */
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_msumc: forwards two vector unsigned long long operands and a vector
   unsigned __int128 operand, in order, to the vmsumcud builtin. */
static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_msumc(vector unsigned long long __a, vector unsigned long long __b,
          vector unsigned __int128 __c) {
  vector unsigned __int128 __res = __builtin_altivec_vmsumcud(__a, __b, __c);
  return __res;
}
#endif
|
||||
|
||||
/* vec_vmsummbm */
|
||||
|
||||
static __inline__ vector int __attribute__((__always_inline__))
|
||||
@@ -5693,6 +5929,26 @@ vec_mule(vector unsigned int __a, vector unsigned int __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_mule for 64-bit elements producing 128-bit results. The builtin is
   chosen by endianness: the "even" builtin (vmule*) is used unless
   __LITTLE_ENDIAN__ is defined, in which case the "odd" builtin (vmulo*) is
   used instead. */
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_mule(vector signed long long __a, vector signed long long __b) {
#ifndef __LITTLE_ENDIAN__
  return __builtin_altivec_vmulesd(__a, __b);
#else
  return __builtin_altivec_vmulosd(__a, __b);
#endif
}

static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_mule(vector unsigned long long __a, vector unsigned long long __b) {
#ifndef __LITTLE_ENDIAN__
  return __builtin_altivec_vmuleud(__a, __b);
#else
  return __builtin_altivec_vmuloud(__a, __b);
#endif
}
#endif
|
||||
|
||||
/* vec_vmulesb */
|
||||
|
||||
static __inline__ vector short __attribute__((__always_inline__))
|
||||
@@ -5737,6 +5993,30 @@ vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_mulh */
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_mulh overloads, available when __POWER10_VECTOR__ is defined. Each one
   forwards its two operands, in order, to the vmulh* builtin that matches the
   element width and signedness. */

/* 32-bit elements. */
static __inline__ vector signed int __ATTRS_o_ai
vec_mulh(vector signed int __a, vector signed int __b) {
  vector signed int __res = __builtin_altivec_vmulhsw(__a, __b);
  return __res;
}

static __inline__ vector unsigned int __ATTRS_o_ai
vec_mulh(vector unsigned int __a, vector unsigned int __b) {
  vector unsigned int __res = __builtin_altivec_vmulhuw(__a, __b);
  return __res;
}

/* 64-bit elements. */
static __inline__ vector signed long long __ATTRS_o_ai
vec_mulh(vector signed long long __a, vector signed long long __b) {
  vector signed long long __res = __builtin_altivec_vmulhsd(__a, __b);
  return __res;
}

static __inline__ vector unsigned long long __ATTRS_o_ai
vec_mulh(vector unsigned long long __a, vector unsigned long long __b) {
  vector unsigned long long __res = __builtin_altivec_vmulhud(__a, __b);
  return __res;
}
#endif
|
||||
|
||||
/* vec_mulo */
|
||||
|
||||
static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
|
||||
@@ -5795,6 +6075,26 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* vec_mulo for 64-bit elements producing 128-bit results. The builtin is
   chosen by endianness: the "odd" builtin (vmulo*) is used unless
   __LITTLE_ENDIAN__ is defined, in which case the "even" builtin (vmule*) is
   used instead. */
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_mulo(vector signed long long __a, vector signed long long __b) {
#ifndef __LITTLE_ENDIAN__
  return __builtin_altivec_vmulosd(__a, __b);
#else
  return __builtin_altivec_vmulesd(__a, __b);
#endif
}

static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_mulo(vector unsigned long long __a, vector unsigned long long __b) {
#ifndef __LITTLE_ENDIAN__
  return __builtin_altivec_vmuloud(__a, __b);
#else
  return __builtin_altivec_vmuleud(__a, __b);
#endif
}
#endif
|
||||
|
||||
/* vec_vmulosb */
|
||||
|
||||
static __inline__ vector short __attribute__((__always_inline__))
|
||||
@@ -7627,6 +7927,18 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
/* 128-bit element rotate, written as the OR of a left shift and the
   complementary right shift, where the total width is
   __CHAR_BIT__ * sizeof(the vector type).
   NOTE(review): as written, __b is the value being shifted and __a supplies
   the shift count. The usual vec_rl contract rotates the first operand by
   the second, so the operand roles look swapped here - confirm against the
   intrinsic reference before relying on this. */
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) {
  return (__b << __a) |
         (__b >> ((__CHAR_BIT__ * sizeof(vector signed __int128)) - __a));
}

static __inline__ vector unsigned __int128 __ATTRS_o_ai
vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) {
  return (__b << __a) |
         (__b >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __a));
}
#endif
|
||||
|
||||
/* vec_rlmi */
|
||||
#ifdef __POWER9_VECTOR__
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
@@ -7640,8 +7952,24 @@ vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
|
||||
vector unsigned long long __c) {
|
||||
return __builtin_altivec_vrldmi(__a, __c, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_rlmi(vector unsigned __int128 __a, vector unsigned __int128 __b,
|
||||
vector unsigned __int128 __c) {
|
||||
return __builtin_altivec_vrlqmi(__a, __c, __b);
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_rlmi(vector signed __int128 __a, vector signed __int128 __b,
|
||||
vector signed __int128 __c) {
|
||||
return __builtin_altivec_vrlqmi(__a, __c, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* vec_rlnm */
|
||||
#ifdef __POWER9_VECTOR__
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_rlnm(vector unsigned int __a, vector unsigned int __b,
|
||||
vector unsigned int __c) {
|
||||
@@ -7657,6 +7985,42 @@ vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b,
|
||||
vector unsigned __int128 __c) {
|
||||
// Merge __b and __c using an appropriate shuffle.
|
||||
vector unsigned char TmpB = (vector unsigned char)__b;
|
||||
vector unsigned char TmpC = (vector unsigned char)__c;
|
||||
vector unsigned char MaskAndShift =
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0,
|
||||
1, -1, -1, -1, -1, -1);
|
||||
#else
|
||||
__builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1);
|
||||
#endif
|
||||
return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift);
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_rlnm(vector signed __int128 __a, vector signed __int128 __b,
|
||||
vector signed __int128 __c) {
|
||||
// Merge __b and __c using an appropriate shuffle.
|
||||
vector unsigned char TmpB = (vector unsigned char)__b;
|
||||
vector unsigned char TmpC = (vector unsigned char)__c;
|
||||
vector unsigned char MaskAndShift =
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0,
|
||||
1, -1, -1, -1, -1, -1);
|
||||
#else
|
||||
__builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1);
|
||||
#endif
|
||||
return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* vec_vrlb */
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
@@ -7771,6 +8135,18 @@ vec_vrsqrtefp(vector float __a) {
|
||||
return __builtin_altivec_vrsqrtefp(__a);
|
||||
}
|
||||
|
||||
/* vec_xvtsqrt */
|
||||
|
||||
#ifdef __VSX__
|
||||
/* Double-precision variant: returns the int produced by
 * __builtin_vsx_xvtsqrtdp for __a. */
static __inline__ int __ATTRS_o_ai vec_test_swsqrt(vector double __a) {
  int __res = __builtin_vsx_xvtsqrtdp(__a);
  return __res;
}
|
||||
|
||||
/* Single-precision variant: returns the int produced by
 * __builtin_vsx_xvtsqrtsp for __a. */
static __inline__ int __ATTRS_o_ai vec_test_swsqrts(vector float __a) {
  int __res = __builtin_vsx_xvtsqrtsp(__a);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_sel */
|
||||
|
||||
/* Object-like alias: every use of __builtin_altivec_vsel_4si expands to
 * vec_sel. */
#define __builtin_altivec_vsel_4si vec_sel
|
||||
@@ -13900,6 +14276,18 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_eq, signed 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_all_eq, unsigned 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_ge */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
|
||||
@@ -14071,6 +14459,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_ge, signed 128-bit overload. Expressed through the signed
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __b, __a);
  return __res;
}
|
||||
|
||||
/* vec_all_ge, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_EQ, __b, __a);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_gt */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
|
||||
@@ -14242,6 +14642,18 @@ static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_gt, signed 128-bit overload: signed greater-than predicate on
 * (__a, __b) with the __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_LT, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_all_gt, unsigned 128-bit overload: unsigned greater-than predicate on
 * (__a, __b) with the __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_LT, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_in */
|
||||
|
||||
static __inline__ int __attribute__((__always_inline__))
|
||||
@@ -14421,6 +14833,18 @@ static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_le, signed 128-bit overload. Expressed through the signed
 * greater-than predicate on (__a, __b) with the __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_le(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_all_le, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate on (__a, __b) with the __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_EQ, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_lt */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
|
||||
@@ -14593,6 +15017,18 @@ static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_lt, signed 128-bit overload. Expressed through the signed
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_LT, __b, __a);
  return __res;
}
|
||||
|
||||
/* vec_all_lt, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_LT selector. */
static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_LT, __b, __a);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_nan */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) {
|
||||
@@ -14797,6 +15233,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_all_ne, signed 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_all_ne, unsigned 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_EQ selector. */
static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_all_nge */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a,
|
||||
@@ -15042,6 +15490,18 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_eq, signed 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_any_eq, unsigned 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_ge */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
|
||||
@@ -15221,6 +15681,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_ge, signed 128-bit overload. Expressed through the signed
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __b, __a);
  return __res;
}
|
||||
|
||||
/* vec_any_ge, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __b, __a);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_gt */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
|
||||
@@ -15400,6 +15872,18 @@ static __inline__ int __ATTRS_o_ai vec_any_gt(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_gt, signed 128-bit overload: signed greater-than predicate on
 * (__a, __b) with the __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_any_gt, unsigned 128-bit overload: unsigned greater-than predicate on
 * (__a, __b) with the __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_le */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
|
||||
@@ -15579,6 +16063,18 @@ static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_le, signed 128-bit overload. Expressed through the signed
 * greater-than predicate on (__a, __b) with the __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_le(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_any_le, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate on (__a, __b) with the __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_lt */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
|
||||
@@ -15758,6 +16254,18 @@ static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_lt, signed 128-bit overload. Expressed through the signed
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __b, __a);
  return __res;
}
|
||||
|
||||
/* vec_any_lt, unsigned 128-bit overload. Expressed through the unsigned
 * greater-than predicate with the operands swapped (__b, __a) and the
 * __CR6_EQ_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __b, __a);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_nan */
|
||||
|
||||
static __inline__ int __attribute__((__always_inline__))
|
||||
@@ -15953,6 +16461,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_any_ne, signed 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed __int128 __a,
                                              vector signed __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
  return __res;
}
|
||||
|
||||
/* vec_any_ne, unsigned 128-bit overload: queries the vcmpequq predicate
 * builtin on (__a, __b) with the __CR6_LT_REV selector. */
static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a,
                                              vector unsigned __int128 __b) {
  int __res = __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
  return __res;
}
|
||||
#endif
|
||||
|
||||
/* vec_any_nge */
|
||||
|
||||
static __inline__ int __attribute__((__always_inline__))
|
||||
@@ -16353,41 +16873,41 @@ typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1)));
|
||||
typedef vector float unaligned_vec_float __attribute__((aligned(1)));
|
||||
|
||||
static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
|
||||
signed char *__ptr) {
|
||||
const signed char *__ptr) {
|
||||
return *(unaligned_vec_schar *)(__ptr + __offset);
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector unsigned char
|
||||
vec_xl(signed long long __offset, unsigned char *__ptr) {
|
||||
vec_xl(signed long long __offset, const unsigned char *__ptr) {
|
||||
return *(unaligned_vec_uchar*)(__ptr + __offset);
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
|
||||
signed short *__ptr) {
|
||||
const signed short *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_sshort *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector unsigned short
|
||||
vec_xl(signed long long __offset, unsigned short *__ptr) {
|
||||
vec_xl(signed long long __offset, const unsigned short *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_ushort *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
|
||||
signed int *__ptr) {
|
||||
const signed int *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_sint *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
|
||||
unsigned int *__ptr) {
|
||||
const unsigned int *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_uint *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
|
||||
float *__ptr) {
|
||||
const float *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_float *)__addr;
|
||||
}
|
||||
@@ -16398,19 +16918,19 @@ typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1)));
|
||||
typedef vector double unaligned_vec_double __attribute__((aligned(1)));
|
||||
|
||||
static inline __ATTRS_o_ai vector signed long long
|
||||
vec_xl(signed long long __offset, signed long long *__ptr) {
|
||||
vec_xl(signed long long __offset, const signed long long *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_sll *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector unsigned long long
|
||||
vec_xl(signed long long __offset, unsigned long long *__ptr) {
|
||||
vec_xl(signed long long __offset, const unsigned long long *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_ull *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
|
||||
double *__ptr) {
|
||||
const double *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_double *)__addr;
|
||||
}
|
||||
@@ -16421,13 +16941,13 @@ typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1)));
|
||||
typedef vector unsigned __int128 unaligned_vec_ui128
|
||||
__attribute__((aligned(1)));
|
||||
static inline __ATTRS_o_ai vector signed __int128
|
||||
vec_xl(signed long long __offset, signed __int128 *__ptr) {
|
||||
vec_xl(signed long long __offset, const signed __int128 *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_si128 *)__addr;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai vector unsigned __int128
|
||||
vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
|
||||
vec_xl(signed long long __offset, const unsigned __int128 *__ptr) {
|
||||
signed char *__addr = (signed char *)__ptr + __offset;
|
||||
return *(unaligned_vec_ui128 *)__addr;
|
||||
}
|
||||
@@ -16437,71 +16957,71 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, signed char *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const signed char *__ptr) {
|
||||
vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
|
||||
13, 12, 11, 10, 9, 8);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, unsigned char *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const unsigned char *__ptr) {
|
||||
vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
|
||||
13, 12, 11, 10, 9, 8);
|
||||
}
|
||||
|
||||
static __inline__ vector signed short __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, signed short *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const signed short *__ptr) {
|
||||
vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, unsigned short *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const unsigned short *__ptr) {
|
||||
vector unsigned short __vec = (vector unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
|
||||
}
|
||||
|
||||
static __inline__ vector signed int __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, signed int *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const signed int *__ptr) {
|
||||
return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, unsigned int *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const unsigned int *__ptr) {
|
||||
return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
|
||||
}
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, float *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const float *__ptr) {
|
||||
return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, signed long long *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const signed long long *__ptr) {
|
||||
return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, unsigned long long *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const unsigned long long *__ptr) {
|
||||
return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
}
|
||||
|
||||
static __inline__ vector double __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, double *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const double *__ptr) {
|
||||
return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, signed __int128 *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const signed __int128 *__ptr) {
|
||||
return vec_xl(__offset, __ptr);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) {
|
||||
vec_xl_be(signed long long __offset, const unsigned __int128 *__ptr) {
|
||||
return vec_xl(__offset, __ptr);
|
||||
}
|
||||
#endif
|
||||
@@ -16509,6 +17029,54 @@ vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) {
|
||||
#define vec_xl_be vec_xl
|
||||
#endif
|
||||
|
||||
#if defined(__POWER10_VECTOR__) && defined(__VSX__)
|
||||
|
||||
/* vec_xl_sext */
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_sext(signed long long __offset, const signed char *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_sext(signed long long __offset, const signed short *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_sext(signed long long __offset, const signed int *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_sext(signed long long __offset, const signed long long *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
/* vec_xl_zext */
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_zext(signed long long __offset, const unsigned char *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_zext(signed long long __offset, const unsigned short *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_zext(signed long long __offset, const unsigned int *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_xl_zext(signed long long __offset, const unsigned long long *__pointer) {
|
||||
return (vector unsigned __int128)*(__pointer + __offset);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* vec_xst */
|
||||
|
||||
static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
|
||||
@@ -16597,6 +17165,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
|
||||
}
|
||||
#endif
|
||||
|
||||
/* vec_xst_trunc */
|
||||
|
||||
#if defined(__POWER10_VECTOR__) && defined(__VSX__)
|
||||
/* vec_xst_trunc to signed char: narrows element 0 of __vec to signed char
 * and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec,
                                              signed long long __offset,
                                              signed char *__ptr) {
  __ptr[__offset] = (signed char)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to unsigned char: narrows element 0 of __vec to unsigned
 * char and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec,
                                              signed long long __offset,
                                              unsigned char *__ptr) {
  __ptr[__offset] = (unsigned char)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to signed short: narrows element 0 of __vec to signed short
 * and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec,
                                              signed long long __offset,
                                              signed short *__ptr) {
  __ptr[__offset] = (signed short)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to unsigned short: narrows element 0 of __vec to unsigned
 * short and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec,
                                              signed long long __offset,
                                              unsigned short *__ptr) {
  __ptr[__offset] = (unsigned short)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to signed int: narrows element 0 of __vec to signed int and
 * stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec,
                                              signed long long __offset,
                                              signed int *__ptr) {
  __ptr[__offset] = (signed int)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to unsigned int: narrows element 0 of __vec to unsigned int
 * and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec,
                                              signed long long __offset,
                                              unsigned int *__ptr) {
  __ptr[__offset] = (unsigned int)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to signed long long: narrows element 0 of __vec to signed
 * long long and stores it at __ptr[__offset] (offset counted in elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec,
                                              signed long long __offset,
                                              signed long long *__ptr) {
  __ptr[__offset] = (signed long long)__vec[0];
}
|
||||
|
||||
/* vec_xst_trunc to unsigned long long: narrows element 0 of __vec to
 * unsigned long long and stores it at __ptr[__offset] (offset counted in
 * elements). */
static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec,
                                              signed long long __offset,
                                              unsigned long long *__ptr) {
  __ptr[__offset] = (unsigned long long)__vec[0];
}
|
||||
#endif
|
||||
|
||||
/* vec_xst_be */
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
@@ -16763,6 +17383,100 @@ static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
|
||||
}
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
|
||||
/* vec_extractm */
|
||||
|
||||
static __inline__ unsigned int __ATTRS_o_ai
|
||||
vec_extractm(vector unsigned char __a) {
|
||||
return __builtin_altivec_vextractbm(__a);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __ATTRS_o_ai
|
||||
vec_extractm(vector unsigned short __a) {
|
||||
return __builtin_altivec_vextracthm(__a);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __ATTRS_o_ai
|
||||
vec_extractm(vector unsigned int __a) {
|
||||
return __builtin_altivec_vextractwm(__a);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __ATTRS_o_ai
|
||||
vec_extractm(vector unsigned long long __a) {
|
||||
return __builtin_altivec_vextractdm(__a);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __ATTRS_o_ai
|
||||
vec_extractm(vector unsigned __int128 __a) {
|
||||
return __builtin_altivec_vextractqm(__a);
|
||||
}
|
||||
|
||||
/* vec_expandm */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_expandm(vector unsigned char __a) {
|
||||
return __builtin_altivec_vexpandbm(__a);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_expandm(vector unsigned short __a) {
|
||||
return __builtin_altivec_vexpandhm(__a);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_expandm(vector unsigned int __a) {
|
||||
return __builtin_altivec_vexpandwm(__a);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_expandm(vector unsigned long long __a) {
|
||||
return __builtin_altivec_vexpanddm(__a);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_expandm(vector unsigned __int128 __a) {
|
||||
return __builtin_altivec_vexpandqm(__a);
|
||||
}
|
||||
|
||||
/* vec_cntm */
|
||||
|
||||
/* Type-generic vec_cntm: _Generic selects the vcntmb{b,h,w,d} builtin that
 * matches the vector type of __a (unsigned char / short / int / long long)
 * and passes __mp converted to unsigned int. Any other type of __a has no
 * matching association. */
#define vec_cntm(__a, __mp)                                                    \
  _Generic(                                                                    \
      (__a),                                                                   \
      vector unsigned char: __builtin_altivec_vcntmbb((__a),                   \
                                                      (unsigned int)(__mp)),   \
      vector unsigned short: __builtin_altivec_vcntmbh((__a),                  \
                                                       (unsigned int)(__mp)),  \
      vector unsigned int: __builtin_altivec_vcntmbw((__a),                    \
                                                     (unsigned int)(__mp)),    \
      vector unsigned long long: __builtin_altivec_vcntmbd(                    \
          (__a), (unsigned int)(__mp)))
|
||||
|
||||
/* vec_gen[b|h|w|d|q]m */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_genbm(unsigned long long __bm) {
|
||||
return __builtin_altivec_mtvsrbm(__bm);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_genhm(unsigned long long __bm) {
|
||||
return __builtin_altivec_mtvsrhm(__bm);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_genwm(unsigned long long __bm) {
|
||||
return __builtin_altivec_mtvsrwm(__bm);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_gendm(unsigned long long __bm) {
|
||||
return __builtin_altivec_mtvsrdm(__bm);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_genqm(unsigned long long __bm) {
|
||||
return __builtin_altivec_mtvsrqm(__bm);
|
||||
}
|
||||
|
||||
/* vec_pdep */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
@@ -16881,6 +17595,38 @@ vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vctzdm(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_mod */
|
||||
|
||||
static __inline__ vector signed int __ATTRS_o_ai
|
||||
vec_mod(vector signed int __a, vector signed int __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_mod(vector unsigned int __a, vector unsigned int __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_mod(vector signed long long __a, vector signed long long __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_mod(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_mod(vector signed __int128 __a, vector signed __int128 __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_mod(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
||||
return __a % __b;
|
||||
}
|
||||
|
||||
/* vec_sldbi */
|
||||
|
||||
/* vec_sldb: forwards to __builtin_altivec_vsldbi with the count masked to its
 * low three bits. The count argument is parenthesized before masking so that
 * a compound expression such as `x | 8` is masked as a whole rather than
 * having only its last operand masked (& binds tighter than |). */
#define vec_sldb(__a, __b, __c) __builtin_altivec_vsldbi(__a, __b, ((__c) & 0x7))
|
||||
@@ -17027,6 +17773,92 @@ vec_inserth(vector unsigned int __a, vector unsigned int __b,
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_extractl */
|
||||
|
||||
/* vec_extractl, byte-element overload. Little-endian targets return the
 * vextdubvrx result directly; otherwise the vextdubvlx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl(
    vector unsigned char __a, vector unsigned char __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextdubvlx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextdubvrx(__a, __b, __c);
#endif
}
|
||||
|
||||
/* vec_extractl, halfword-element overload. Little-endian targets return the
 * vextduhvrx result directly; otherwise the vextduhvlx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl(
    vector unsigned short __a, vector unsigned short __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextduhvlx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextduhvrx(__a, __b, __c);
#endif
}
|
||||
|
||||
/* vec_extractl, word-element overload. Little-endian targets return the
 * vextduwvrx result directly; otherwise the vextduwvlx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl(
    vector unsigned int __a, vector unsigned int __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextduwvlx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextduwvrx(__a, __b, __c);
#endif
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_extractl(vector unsigned long long __a, vector unsigned long long __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vextddvrx(__a, __b, __c);
|
||||
#else
|
||||
vector unsigned long long __ret = __builtin_altivec_vextddvlx(__a, __b, __c);
|
||||
return vec_sld(__ret, __ret, 8);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_extracth */
|
||||
|
||||
/* vec_extracth, byte-element overload. Little-endian targets return the
 * vextdubvlx result directly; otherwise the vextdubvrx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth(
    vector unsigned char __a, vector unsigned char __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextdubvrx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextdubvlx(__a, __b, __c);
#endif
}
|
||||
|
||||
/* vec_extracth, halfword-element overload. Little-endian targets return the
 * vextduhvlx result directly; otherwise the vextduhvrx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth(
    vector unsigned short __a, vector unsigned short __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextduhvrx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextduhvlx(__a, __b, __c);
#endif
}
|
||||
|
||||
/* vec_extracth, word-element overload. Little-endian targets return the
 * vextduwvlx result directly; otherwise the vextduwvrx result is passed
 * through vec_sld(x, x, 8) before being returned. */
static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth(
    vector unsigned int __a, vector unsigned int __b, unsigned int __c) {
#ifndef __LITTLE_ENDIAN__
  vector unsigned long long __raw = __builtin_altivec_vextduwvrx(__a, __b, __c);
  return vec_sld(__raw, __raw, 8);
#else
  return __builtin_altivec_vextduwvlx(__a, __b, __c);
#endif
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_extracth(vector unsigned long long __a, vector unsigned long long __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vextddvlx(__a, __b, __c);
|
||||
#else
|
||||
vector unsigned long long __ret = __builtin_altivec_vextddvrx(__a, __b, __c);
|
||||
return vec_sld(__ret, __ret, 8);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
|
||||
/* vec_permx */
|
||||
@@ -17095,6 +17927,14 @@ vec_blendv(vector double __a, vector double __b,
|
||||
return __builtin_vsx_xxblendvd(__a, __b, __c);
|
||||
}
|
||||
|
||||
/* vec_replace_elt */
|
||||
|
||||
/* Object-like alias: every use of vec_replace_elt expands textually to the
 * compiler builtin named below. */
#define vec_replace_elt __builtin_altivec_vec_replace_elt
|
||||
|
||||
/* vec_replace_unaligned */
|
||||
|
||||
/* Object-like alias: every use of vec_replace_unaligned expands textually to
 * the compiler builtin named below. */
#define vec_replace_unaligned __builtin_altivec_vec_replace_unaligned
|
||||
|
||||
/* vec_splati */
|
||||
|
||||
#define vec_splati(__a) \
|
||||
@@ -17161,6 +18001,197 @@ vec_test_lsbb_all_zeros(vector unsigned char __a) {
|
||||
return __builtin_vsx_xvtlsbb(__a, 0);
|
||||
}
|
||||
#endif /* __VSX__ */
|
||||
|
||||
/* vec_stril */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_stril(vector unsigned char __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstribr((vector signed char)__a);
|
||||
#else
|
||||
return __builtin_altivec_vstribl((vector signed char)__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_stril(vector signed char __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstribr(__a);
|
||||
#else
|
||||
return __builtin_altivec_vstribl(__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_stril(vector unsigned short __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstrihr((vector signed short)__a);
|
||||
#else
|
||||
return __builtin_altivec_vstrihl((vector signed short)__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector signed short __ATTRS_o_ai
|
||||
vec_stril(vector signed short __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstrihr(__a);
|
||||
#else
|
||||
return __builtin_altivec_vstrihl(__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_stril_p */
|
||||
|
||||
/* vec_stril_p, unsigned-char overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstribr_p on little-endian targets, vstribl_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned char __a) {
  vector signed char __src = (vector signed char)__a;
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstribr_p(__CR6_EQ, __src);
#else
  __pred = __builtin_altivec_vstribl_p(__CR6_EQ, __src);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_stril_p, signed-char overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstribr_p on little-endian targets, vstribl_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed char __a) {
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstribr_p(__CR6_EQ, __a);
#else
  __pred = __builtin_altivec_vstribl_p(__CR6_EQ, __a);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_stril_p, unsigned-short overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstrihr_p on little-endian targets, vstrihl_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned short __a) {
  vector signed short __src = (vector signed short)__a;
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstrihr_p(__CR6_EQ, __src);
#else
  __pred = __builtin_altivec_vstrihl_p(__CR6_EQ, __src);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_stril_p, signed-short overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstrihr_p on little-endian targets, vstrihl_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed short __a) {
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstrihr_p(__CR6_EQ, __a);
#else
  __pred = __builtin_altivec_vstrihl_p(__CR6_EQ, __a);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_strir */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_strir(vector unsigned char __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstribl((vector signed char)__a);
|
||||
#else
|
||||
return __builtin_altivec_vstribr((vector signed char)__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_strir(vector signed char __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstribl(__a);
|
||||
#else
|
||||
return __builtin_altivec_vstribr(__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_strir(vector unsigned short __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstrihl((vector signed short)__a);
|
||||
#else
|
||||
return __builtin_altivec_vstrihr((vector signed short)__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector signed short __ATTRS_o_ai
|
||||
vec_strir(vector signed short __a) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vstrihl(__a);
|
||||
#else
|
||||
return __builtin_altivec_vstrihr(__a);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_strir_p */
|
||||
|
||||
/* vec_strir_p, unsigned-char overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstribl_p on little-endian targets, vstribr_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned char __a) {
  vector signed char __src = (vector signed char)__a;
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstribl_p(__CR6_EQ, __src);
#else
  __pred = __builtin_altivec_vstribr_p(__CR6_EQ, __src);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_strir_p, signed-char overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstribl_p on little-endian targets, vstribr_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed char __a) {
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstribl_p(__CR6_EQ, __a);
#else
  __pred = __builtin_altivec_vstribr_p(__CR6_EQ, __a);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_strir_p, unsigned-short overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstrihl_p on little-endian targets, vstrihr_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned short __a) {
  vector signed short __src = (vector signed short)__a;
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstrihl_p(__CR6_EQ, __src);
#else
  __pred = __builtin_altivec_vstrihr_p(__CR6_EQ, __src);
#endif
  return __pred;
}
|
||||
|
||||
/* vec_strir_p, signed-short overload.
 * Returns the int produced by the predicate builtin, invoked with __CR6_EQ:
 * vstrihl_p on little-endian targets, vstrihr_p otherwise. */
static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed short __a) {
  int __pred;
#ifdef __LITTLE_ENDIAN__
  __pred = __builtin_altivec_vstrihl_p(__CR6_EQ, __a);
#else
  __pred = __builtin_altivec_vstrihr_p(__CR6_EQ, __a);
#endif
  return __pred;
}
|
||||
|
||||
/* vs[l | r | ra] */
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_sl(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
||||
return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_sl(vector signed __int128 __a, vector unsigned __int128 __b) {
|
||||
return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_sr(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
||||
return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_sr(vector signed __int128 __a, vector unsigned __int128 __b) {
|
||||
return (
|
||||
vector signed __int128)(((vector unsigned __int128)__a) >>
|
||||
(__b %
|
||||
(vector unsigned __int128)(sizeof(
|
||||
unsigned __int128) *
|
||||
__CHAR_BIT__)));
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned __int128 __ATTRS_o_ai
|
||||
vec_sra(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
||||
return (
|
||||
vector unsigned __int128)(((vector signed __int128)__a) >>
|
||||
(__b %
|
||||
(vector unsigned __int128)(sizeof(
|
||||
unsigned __int128) *
|
||||
__CHAR_BIT__)));
|
||||
}
|
||||
|
||||
static __inline__ vector signed __int128 __ATTRS_o_ai
|
||||
vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) {
|
||||
return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
#endif /* __POWER10_VECTOR__ */
|
||||
|
||||
#undef __ATTRS_o_ai
|
||||
|
||||
Vendored
+70
-22
@@ -15,8 +15,8 @@
|
||||
#define __AMXINTRIN_H
|
||||
#ifdef __x86_64__
|
||||
|
||||
/* Function attribute sets for the tile-configuration intrinsics. Both names
 * expand to the same list: always-inline, no debug info, and the "amx-tile"
 * target feature. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_TILE                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
|
||||
|
||||
/// Load tile configuration from a 64-byte memory location specified by
|
||||
/// "mem_addr". The tile configuration includes the tile type palette, the
|
||||
@@ -31,9 +31,8 @@
|
||||
///
|
||||
/// \param __config
|
||||
/// A pointer to 512-bits configuration
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_loadconfig(const void *__config)
|
||||
{
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||
_tile_loadconfig(const void *__config) {
|
||||
__builtin_ia32_tile_loadconfig(__config);
|
||||
}
|
||||
|
||||
@@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
|
||||
///
|
||||
/// \param __config
|
||||
/// A pointer to 512-bits configuration
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_storeconfig(void *__config)
|
||||
{
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||
_tile_storeconfig(void *__config) {
|
||||
__builtin_ia32_tile_storeconfig(__config);
|
||||
}
|
||||
|
||||
@@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_release(void)
|
||||
{
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
||||
__builtin_ia32_tilerelease();
|
||||
}
|
||||
|
||||
@@ -80,8 +76,9 @@ _tile_release(void)
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
// Defined once: base is cast to const void * and stride to __SIZE_TYPE__
// before being passed to the builtin.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Load tile rows from memory specified by "base" address and "stride" into
|
||||
/// destination tile "dst" using the tile configuration previously configured
|
||||
@@ -99,8 +96,9 @@ _tile_release(void)
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
// Defined once: base is cast to const void * and stride to __SIZE_TYPE__
// before being passed to the builtin.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Store the tile specified by "src" to memory specified by "base" address and
|
||||
/// "stride" using the tile configuration previously configured via
|
||||
@@ -116,7 +114,7 @@ _tile_release(void)
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be stored in memory.
|
||||
// Defined once: base is cast to void * and stride to __SIZE_TYPE__ before
// being passed to the builtin.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Zero the tile specified by "tdest".
|
||||
@@ -145,7 +143,8 @@ _tile_release(void)
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
// Defined once: the three tile operands are forwarded, parenthesized, to the
// tdpbssd builtin.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
@@ -163,7 +162,8 @@ _tile_release(void)
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
// Defined once: the three tile operands are forwarded, parenthesized, to the
// tdpbsud builtin.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
@@ -181,7 +181,8 @@ _tile_release(void)
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
// Defined once: the three tile operands are forwarded, parenthesized, to the
// tdpbusd builtin.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
@@ -199,7 +200,8 @@ _tile_release(void)
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
// Defined once: the three tile operands are forwarded, parenthesized, to the
// tdpbuud builtin.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||
@@ -216,10 +218,56 @@ _tile_release(void)
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
// Defined once: the three tile operands are forwarded, parenthesized, to the
// tdpbf16ps builtin.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
/* Attribute set used by the *_internal helpers and the __tile_* wrappers:
 * always-inline, no debug info, and the "amx-int8" target feature. */
#define __DEFAULT_FN_ATTRS_INT8                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))

/* 1024-byte vector of int, 64-byte aligned, used as the tile value type. */
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
return __builtin_ia32_tileloadd64_internal(m, n, base,
|
||||
(__SIZE_TYPE__)(stride));
|
||||
}
|
||||
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
|
||||
__SIZE_TYPE__ stride, _tile1024i tile) {
|
||||
return __builtin_ia32_tilestored64_internal(m, n, base,
|
||||
(__SIZE_TYPE__)(stride), tile);
|
||||
}
|
||||
|
||||
typedef struct __tile1024i_str {
|
||||
const unsigned short row;
|
||||
const unsigned short col;
|
||||
_tile1024i tile;
|
||||
} __tile1024i;
|
||||
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_loadd(__tile1024i *dst, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
||||
}
|
||||
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
|
||||
__tile1024i src2) {
|
||||
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
|
||||
src1.tile, src2.tile);
|
||||
}
|
||||
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
|
||||
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
|
||||
}
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
#endif /* __AMXINTRIN_H */
|
||||
|
||||
Vendored
+441
-127
@@ -40429,6 +40429,150 @@ __ai float32x4_t vcaddq_rot90_f32(float32x4_t __p0, float32x4_t __p1) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
|
||||
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float32x4_t __ret;
|
||||
__ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
|
||||
float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float32x2_t __ret;
|
||||
__ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
@@ -40499,6 +40643,150 @@ __ai float16x8_t vcaddq_rot90_f16(float16x8_t __p0, float16x8_t __p1) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
|
||||
float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
float16x8_t __ret;
|
||||
__ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
|
||||
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
|
||||
float16x4_t __ret;
|
||||
__ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
@@ -40535,6 +40823,98 @@ __ai float64x2_t vcaddq_rot90_f64(float64x2_t __p0, float64x2_t __p1) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai float64x1_t vcmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
|
||||
float64x1_t __ret;
|
||||
__ret = (float64x1_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
|
||||
return __ret;
|
||||
}
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai float64x1_t vcmla_rot180_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
|
||||
float64x1_t __ret;
|
||||
__ret = (float64x1_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
|
||||
return __ret;
|
||||
}
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai float64x1_t vcmla_rot270_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
|
||||
float64x1_t __ret;
|
||||
__ret = (float64x1_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
|
||||
return __ret;
|
||||
}
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai float64x1_t vcmla_rot90_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
|
||||
float64x1_t __ret;
|
||||
__ret = (float64x1_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
@@ -45860,9 +46240,9 @@ __ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1);
|
||||
__ai uint64_t vceqd_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vceqd_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
|
||||
@@ -45896,22 +46276,6 @@ __ai uint64x1_t vceqz_p64(poly64x1_t __p0) {
|
||||
__ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
|
||||
return __ret;
|
||||
}
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
|
||||
uint16x4_t __ret;
|
||||
__ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
|
||||
poly16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
uint16x4_t __ret;
|
||||
__ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint8x16_t vceqzq_p8(poly8x16_t __p0) {
|
||||
uint8x16_t __ret;
|
||||
@@ -45944,22 +46308,6 @@ __ai uint64x2_t vceqzq_p64(poly64x2_t __p0) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
|
||||
uint16x8_t __ret;
|
||||
__ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
|
||||
poly16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
uint16x8_t __ret;
|
||||
__ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint8x16_t vceqzq_u8(uint8x16_t __p0) {
|
||||
uint8x16_t __ret;
|
||||
@@ -46252,9 +46600,9 @@ __ai uint64_t vceqzd_u64(uint64_t __p0) {
|
||||
__ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vceqzd_s64(int64_t __p0) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vceqzd_s64(__p0);
|
||||
__ai uint64_t vceqzd_s64(int64_t __p0) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vceqzd_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vceqzd_f64(float64_t __p0) {
|
||||
@@ -46333,9 +46681,9 @@ __ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
|
||||
__ret = (uint64x1_t)(__p0 >= __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1);
|
||||
__ai uint64_t vcged_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcged_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
|
||||
@@ -46523,9 +46871,9 @@ __ai uint16x4_t vcgez_s16(int16x4_t __p0) {
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai int64_t vcgezd_s64(int64_t __p0) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcgezd_s64(__p0);
|
||||
__ai uint64_t vcgezd_s64(int64_t __p0) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcgezd_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcgezd_f64(float64_t __p0) {
|
||||
@@ -46604,9 +46952,9 @@ __ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
|
||||
__ret = (uint64x1_t)(__p0 > __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
|
||||
__ai uint64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
|
||||
@@ -46794,9 +47142,9 @@ __ai uint16x4_t vcgtz_s16(int16x4_t __p0) {
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai int64_t vcgtzd_s64(int64_t __p0) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0);
|
||||
__ai uint64_t vcgtzd_s64(int64_t __p0) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcgtzd_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcgtzd_f64(float64_t __p0) {
|
||||
@@ -46880,9 +47228,9 @@ __ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1);
|
||||
__ai uint64_t vcled_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcled_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
|
||||
@@ -47065,9 +47413,9 @@ __ai uint16x4_t vclez_s16(int16x4_t __p0) {
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai int64_t vclezd_s64(int64_t __p0) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vclezd_s64(__p0);
|
||||
__ai uint64_t vclezd_s64(int64_t __p0) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vclezd_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vclezd_f64(float64_t __p0) {
|
||||
@@ -47151,9 +47499,9 @@ __ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1);
|
||||
__ai uint64_t vcltd_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcltd_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
|
||||
@@ -47336,9 +47684,9 @@ __ai uint16x4_t vcltz_s16(int16x4_t __p0) {
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai int64_t vcltzd_s64(int64_t __p0) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vcltzd_s64(__p0);
|
||||
__ai uint64_t vcltzd_s64(int64_t __p0) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vcltzd_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vcltzd_f64(float64_t __p0) {
|
||||
@@ -52787,23 +53135,6 @@ __ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2)
|
||||
})
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = __p0 + __p1 * (float64x2_t) {__p2, __p2};
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = __rev0 + __rev1 * (float64x2_t) {__p2, __p2};
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
#define vmlal_high_lane_u32(__p0_443, __p1_443, __p2_443, __p3_443) __extension__ ({ \
|
||||
uint64x2_t __s0_443 = __p0_443; \
|
||||
@@ -53355,23 +53686,6 @@ __ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2)
|
||||
})
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
|
||||
float64x2_t __ret;
|
||||
__ret = __p0 - __p1 * (float64x2_t) {__p2, __p2};
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
|
||||
float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
float64x2_t __ret;
|
||||
__ret = __rev0 - __rev1 * (float64x2_t) {__p2, __p2};
|
||||
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
|
||||
return __ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
#define vmlsl_high_lane_u32(__p0_487, __p1_487, __p2_487, __p3_487) __extension__ ({ \
|
||||
uint64x2_t __s0_487 = __p0_487; \
|
||||
@@ -57188,30 +57502,30 @@ __ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
|
||||
}
|
||||
#endif
|
||||
|
||||
__ai int16_t vqmovuns_s32(int32_t __p0) {
|
||||
int16_t __ret;
|
||||
__ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0);
|
||||
__ai uint16_t vqmovuns_s32(int32_t __p0) {
|
||||
uint16_t __ret;
|
||||
__ret = (uint16_t) __builtin_neon_vqmovuns_s32(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai int32_t vqmovund_s64(int64_t __p0) {
|
||||
int32_t __ret;
|
||||
__ret = (int32_t) __builtin_neon_vqmovund_s64(__p0);
|
||||
__ai uint32_t vqmovund_s64(int64_t __p0) {
|
||||
uint32_t __ret;
|
||||
__ret = (uint32_t) __builtin_neon_vqmovund_s64(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai int8_t vqmovunh_s16(int16_t __p0) {
|
||||
int8_t __ret;
|
||||
__ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0);
|
||||
__ai uint8_t vqmovunh_s16(int16_t __p0) {
|
||||
uint8_t __ret;
|
||||
__ret = (uint8_t) __builtin_neon_vqmovunh_s16(__p0);
|
||||
return __ret;
|
||||
}
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
|
||||
__ai uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) {
|
||||
uint16x8_t __ret;
|
||||
__ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1));
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
|
||||
int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
__ai uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) {
|
||||
uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
|
||||
int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
|
||||
uint16x8_t __ret;
|
||||
__ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1));
|
||||
@@ -57221,14 +57535,14 @@ __ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
|
||||
__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) {
|
||||
uint32x4_t __ret;
|
||||
__ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1));
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
|
||||
int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) {
|
||||
uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
|
||||
int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
|
||||
uint32x4_t __ret;
|
||||
__ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1));
|
||||
@@ -57238,14 +57552,14 @@ __ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
|
||||
__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) {
|
||||
uint8x16_t __ret;
|
||||
__ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1));
|
||||
return __ret;
|
||||
}
|
||||
#else
|
||||
__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
|
||||
int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) {
|
||||
uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
uint8x16_t __ret;
|
||||
__ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1));
|
||||
@@ -57549,22 +57863,22 @@ __ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
|
||||
})
|
||||
#endif
|
||||
|
||||
__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) {
|
||||
__ai uint8_t vqrshlb_u8(uint8_t __p0, int8_t __p1) {
|
||||
uint8_t __ret;
|
||||
__ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) {
|
||||
__ai uint32_t vqrshls_u32(uint32_t __p0, int32_t __p1) {
|
||||
uint32_t __ret;
|
||||
__ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ai uint64_t vqrshld_u64(uint64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) {
|
||||
__ai uint16_t vqrshlh_u16(uint16_t __p0, int16_t __p1) {
|
||||
uint16_t __ret;
|
||||
__ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
|
||||
return __ret;
|
||||
@@ -57832,22 +58146,22 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
|
||||
__ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
|
||||
__ret; \
|
||||
})
|
||||
__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
|
||||
__ai uint8_t vqshlb_u8(uint8_t __p0, int8_t __p1) {
|
||||
uint8_t __ret;
|
||||
__ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
|
||||
__ai uint32_t vqshls_u32(uint32_t __p0, int32_t __p1) {
|
||||
uint32_t __ret;
|
||||
__ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ai uint64_t vqshld_u64(uint64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
|
||||
__ai uint16_t vqshlh_u16(uint16_t __p0, int16_t __p1) {
|
||||
uint16_t __ret;
|
||||
__ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
|
||||
return __ret;
|
||||
@@ -59452,7 +59766,7 @@ __ai float32_t vrecpxs_f32(float32_t __p0) {
|
||||
__ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
|
||||
return __ret;
|
||||
}
|
||||
__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ai uint64_t vrshld_u64(uint64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
|
||||
return __ret;
|
||||
@@ -59853,7 +60167,7 @@ __ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
|
||||
__ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (float64x1_t)__s1, __p2); \
|
||||
__ret; \
|
||||
})
|
||||
__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ai uint64_t vshld_u64(uint64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
|
||||
return __ret;
|
||||
@@ -62423,9 +62737,9 @@ __ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
|
||||
__ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
|
||||
int64_t __ret;
|
||||
__ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1);
|
||||
__ai uint64_t vtstd_s64(int64_t __p0, int64_t __p1) {
|
||||
uint64_t __ret;
|
||||
__ret = (uint64_t) __builtin_neon_vtstd_s64(__p0, __p1);
|
||||
return __ret;
|
||||
}
|
||||
__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) {
|
||||
|
||||
Vendored
+80
-80
@@ -94,7 +94,7 @@ typedef __clang_svbfloat16x2_t svbfloat16x2_t;
|
||||
typedef __clang_svbfloat16x3_t svbfloat16x3_t;
|
||||
typedef __clang_svbfloat16x4_t svbfloat16x4_t;
|
||||
#endif
|
||||
typedef enum
|
||||
enum svpattern
|
||||
{
|
||||
SV_POW2 = 0,
|
||||
SV_VL1 = 1,
|
||||
@@ -113,9 +113,9 @@ typedef enum
|
||||
SV_MUL4 = 29,
|
||||
SV_MUL3 = 30,
|
||||
SV_ALL = 31
|
||||
} sv_pattern;
|
||||
};
|
||||
|
||||
typedef enum
|
||||
enum svprfop
|
||||
{
|
||||
SV_PLDL1KEEP = 0,
|
||||
SV_PLDL1STRM = 1,
|
||||
@@ -129,7 +129,7 @@ typedef enum
|
||||
SV_PSTL2STRM = 11,
|
||||
SV_PSTL3KEEP = 12,
|
||||
SV_PSTL3STRM = 13
|
||||
} sv_prfop;
|
||||
};
|
||||
|
||||
/* Function attributes */
|
||||
#define __aio static inline __attribute__((__always_inline__, __nodebug__, __overloadable__))
|
||||
@@ -10013,69 +10013,69 @@ int16_t svorv(svbool_t, svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
|
||||
svbool_t svpfirst(svbool_t, svbool_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
|
||||
void svprfb_gather(svbool_t, svuint32_t, sv_prfop);
|
||||
void svprfb_gather(svbool_t, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base)))
|
||||
void svprfb_gather(svbool_t, svuint64_t, sv_prfop);
|
||||
void svprfb_gather(svbool_t, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset)))
|
||||
void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset)))
|
||||
void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset)))
|
||||
void svprfb_gather_offset(svbool_t, void const *, svint32_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, void const *, svint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset)))
|
||||
void svprfb_gather_offset(svbool_t, void const *, svuint32_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, void const *, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset)))
|
||||
void svprfb_gather_offset(svbool_t, void const *, svint64_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, void const *, svint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset)))
|
||||
void svprfb_gather_offset(svbool_t, void const *, svuint64_t, sv_prfop);
|
||||
void svprfb_gather_offset(svbool_t, void const *, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base)))
|
||||
void svprfd_gather(svbool_t, svuint32_t, sv_prfop);
|
||||
void svprfd_gather(svbool_t, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base)))
|
||||
void svprfd_gather(svbool_t, svuint64_t, sv_prfop);
|
||||
void svprfd_gather(svbool_t, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index)))
|
||||
void svprfd_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index)))
|
||||
void svprfd_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index)))
|
||||
void svprfd_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index)))
|
||||
void svprfd_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index)))
|
||||
void svprfd_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index)))
|
||||
void svprfd_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
|
||||
void svprfd_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base)))
|
||||
void svprfh_gather(svbool_t, svuint32_t, sv_prfop);
|
||||
void svprfh_gather(svbool_t, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base)))
|
||||
void svprfh_gather(svbool_t, svuint64_t, sv_prfop);
|
||||
void svprfh_gather(svbool_t, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index)))
|
||||
void svprfh_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index)))
|
||||
void svprfh_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index)))
|
||||
void svprfh_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index)))
|
||||
void svprfh_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index)))
|
||||
void svprfh_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index)))
|
||||
void svprfh_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
|
||||
void svprfh_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base)))
|
||||
void svprfw_gather(svbool_t, svuint32_t, sv_prfop);
|
||||
void svprfw_gather(svbool_t, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base)))
|
||||
void svprfw_gather(svbool_t, svuint64_t, sv_prfop);
|
||||
void svprfw_gather(svbool_t, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index)))
|
||||
void svprfw_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index)))
|
||||
void svprfw_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index)))
|
||||
void svprfw_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index)))
|
||||
void svprfw_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index)))
|
||||
void svprfw_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index)))
|
||||
void svprfw_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
|
||||
void svprfw_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
|
||||
svint8_t svqadd(svint8_t, int8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
|
||||
@@ -10117,13 +10117,13 @@ uint32_t svqdecb(uint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64)))
|
||||
uint64_t svqdecb(uint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32)))
|
||||
int32_t svqdecb_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqdecb_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64)))
|
||||
int64_t svqdecb_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqdecb_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32)))
|
||||
uint32_t svqdecb_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqdecb_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64)))
|
||||
uint64_t svqdecb_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqdecb_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32)))
|
||||
int32_t svqdecd(int32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64)))
|
||||
@@ -10137,17 +10137,17 @@ svint64_t svqdecd(svint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64)))
|
||||
svuint64_t svqdecd(svuint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32)))
|
||||
int32_t svqdecd_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqdecd_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64)))
|
||||
int64_t svqdecd_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqdecd_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32)))
|
||||
uint32_t svqdecd_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqdecd_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64)))
|
||||
uint64_t svqdecd_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqdecd_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64)))
|
||||
svint64_t svqdecd_pat(svint64_t, sv_pattern, uint64_t);
|
||||
svint64_t svqdecd_pat(svint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64)))
|
||||
svuint64_t svqdecd_pat(svuint64_t, sv_pattern, uint64_t);
|
||||
svuint64_t svqdecd_pat(svuint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32)))
|
||||
int32_t svqdech(int32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64)))
|
||||
@@ -10161,17 +10161,17 @@ svint16_t svqdech(svint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16)))
|
||||
svuint16_t svqdech(svuint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32)))
|
||||
int32_t svqdech_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqdech_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64)))
|
||||
int64_t svqdech_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqdech_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32)))
|
||||
uint32_t svqdech_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqdech_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64)))
|
||||
uint64_t svqdech_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqdech_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16)))
|
||||
svint16_t svqdech_pat(svint16_t, sv_pattern, uint64_t);
|
||||
svint16_t svqdech_pat(svint16_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16)))
|
||||
svuint16_t svqdech_pat(svuint16_t, sv_pattern, uint64_t);
|
||||
svuint16_t svqdech_pat(svuint16_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8)))
|
||||
int32_t svqdecp_b8(int32_t, svbool_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32)))
|
||||
@@ -10229,17 +10229,17 @@ svint32_t svqdecw(svint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32)))
|
||||
svuint32_t svqdecw(svuint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32)))
|
||||
int32_t svqdecw_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqdecw_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64)))
|
||||
int64_t svqdecw_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqdecw_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32)))
|
||||
uint32_t svqdecw_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqdecw_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64)))
|
||||
uint64_t svqdecw_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqdecw_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32)))
|
||||
svint32_t svqdecw_pat(svint32_t, sv_pattern, uint64_t);
|
||||
svint32_t svqdecw_pat(svint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32)))
|
||||
svuint32_t svqdecw_pat(svuint32_t, sv_pattern, uint64_t);
|
||||
svuint32_t svqdecw_pat(svuint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32)))
|
||||
int32_t svqincb(int32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64)))
|
||||
@@ -10249,13 +10249,13 @@ uint32_t svqincb(uint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64)))
|
||||
uint64_t svqincb(uint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32)))
|
||||
int32_t svqincb_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqincb_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64)))
|
||||
int64_t svqincb_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqincb_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32)))
|
||||
uint32_t svqincb_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqincb_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64)))
|
||||
uint64_t svqincb_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqincb_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32)))
|
||||
int32_t svqincd(int32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64)))
|
||||
@@ -10269,17 +10269,17 @@ svint64_t svqincd(svint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64)))
|
||||
svuint64_t svqincd(svuint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32)))
|
||||
int32_t svqincd_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqincd_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64)))
|
||||
int64_t svqincd_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqincd_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32)))
|
||||
uint32_t svqincd_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqincd_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64)))
|
||||
uint64_t svqincd_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqincd_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64)))
|
||||
svint64_t svqincd_pat(svint64_t, sv_pattern, uint64_t);
|
||||
svint64_t svqincd_pat(svint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64)))
|
||||
svuint64_t svqincd_pat(svuint64_t, sv_pattern, uint64_t);
|
||||
svuint64_t svqincd_pat(svuint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32)))
|
||||
int32_t svqinch(int32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64)))
|
||||
@@ -10293,17 +10293,17 @@ svint16_t svqinch(svint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16)))
|
||||
svuint16_t svqinch(svuint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32)))
|
||||
int32_t svqinch_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqinch_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64)))
|
||||
int64_t svqinch_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqinch_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32)))
|
||||
uint32_t svqinch_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqinch_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64)))
|
||||
uint64_t svqinch_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqinch_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16)))
|
||||
svint16_t svqinch_pat(svint16_t, sv_pattern, uint64_t);
|
||||
svint16_t svqinch_pat(svint16_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16)))
|
||||
svuint16_t svqinch_pat(svuint16_t, sv_pattern, uint64_t);
|
||||
svuint16_t svqinch_pat(svuint16_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8)))
|
||||
int32_t svqincp_b8(int32_t, svbool_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32)))
|
||||
@@ -10361,17 +10361,17 @@ svint32_t svqincw(svint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32)))
|
||||
svuint32_t svqincw(svuint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32)))
|
||||
int32_t svqincw_pat(int32_t, sv_pattern, uint64_t);
|
||||
int32_t svqincw_pat(int32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64)))
|
||||
int64_t svqincw_pat(int64_t, sv_pattern, uint64_t);
|
||||
int64_t svqincw_pat(int64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32)))
|
||||
uint32_t svqincw_pat(uint32_t, sv_pattern, uint64_t);
|
||||
uint32_t svqincw_pat(uint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64)))
|
||||
uint64_t svqincw_pat(uint64_t, sv_pattern, uint64_t);
|
||||
uint64_t svqincw_pat(uint64_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32)))
|
||||
svint32_t svqincw_pat(svint32_t, sv_pattern, uint64_t);
|
||||
svint32_t svqincw_pat(svint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32)))
|
||||
svuint32_t svqincw_pat(svuint32_t, sv_pattern, uint64_t);
|
||||
svuint32_t svqincw_pat(svuint32_t, enum svpattern, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8)))
|
||||
svint8_t svqsub(svint8_t, int8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32)))
|
||||
|
||||
Vendored
+40
-117
@@ -9305,295 +9305,218 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
|
||||
* This takes log2(n) steps where n is the number of elements in the vector.
|
||||
*/
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
|
||||
__v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
|
||||
__m256i __t3 = (__m256i)(__t1 op __t2); \
|
||||
__v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
|
||||
__v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
|
||||
__v2du __t6 = __t4 op __t5; \
|
||||
__v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
|
||||
__v2du __t8 = __t6 op __t7; \
|
||||
return __t8[0]
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_add_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_mul_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(&);
|
||||
return __builtin_ia32_reduce_and_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(|);
|
||||
return __builtin_ia32_reduce_or_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
|
||||
__W = _mm512_maskz_mov_epi64(__M, __W);
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_add_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
|
||||
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_mul_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
|
||||
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
|
||||
_mm512_mask_reduce_operator(&);
|
||||
return __builtin_ia32_reduce_and_q512(__W);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
|
||||
__W = _mm512_maskz_mov_epi64(__M, __W);
|
||||
_mm512_mask_reduce_operator(|);
|
||||
return __builtin_ia32_reduce_or_q512(__W);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
|
||||
__m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
|
||||
__m256d __t3 = __t1 op __t2; \
|
||||
__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
|
||||
__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
|
||||
__m128d __t6 = __t4 op __t5; \
|
||||
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
|
||||
__m128d __t8 = __t6 op __t7; \
|
||||
return __t8[0]
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
|
||||
__W = _mm512_maskz_mov_pd(__M, __W);
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
|
||||
__W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
|
||||
__v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
|
||||
__m256i __t3 = (__m256i)(__t1 op __t2); \
|
||||
__v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
|
||||
__v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
|
||||
__v4su __t6 = __t4 op __t5; \
|
||||
__v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
|
||||
__v4su __t8 = __t6 op __t7; \
|
||||
__v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
|
||||
__v4su __t10 = __t8 op __t9; \
|
||||
return __t10[0]
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_add_epi32(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_add_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_mul_epi32(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_and_epi32(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(&);
|
||||
return __builtin_ia32_reduce_and_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_or_epi32(__m512i __W) {
|
||||
_mm512_mask_reduce_operator(|);
|
||||
return __builtin_ia32_reduce_or_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
|
||||
__W = _mm512_maskz_mov_epi32(__M, __W);
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_add_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
|
||||
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
|
||||
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
|
||||
_mm512_mask_reduce_operator(&);
|
||||
return __builtin_ia32_reduce_and_d512((__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
|
||||
__W = _mm512_maskz_mov_epi32(__M, __W);
|
||||
_mm512_mask_reduce_operator(|);
|
||||
return __builtin_ia32_reduce_or_d512((__v16si)__W);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
|
||||
__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
|
||||
__m256 __t3 = __t1 op __t2; \
|
||||
__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
|
||||
__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
|
||||
__m128 __t6 = __t4 op __t5; \
|
||||
__m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
|
||||
__m128 __t8 = __t6 op __t7; \
|
||||
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
|
||||
__m128 __t10 = __t8 op __t9; \
|
||||
return __t10[0]
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_add_ps(__m512 __W) {
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_mul_ps(__m512 __W) {
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
|
||||
__W = _mm512_maskz_mov_ps(__M, __W);
|
||||
_mm512_mask_reduce_operator(+);
|
||||
return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
|
||||
__W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
|
||||
_mm512_mask_reduce_operator(*);
|
||||
return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
|
||||
__m512i __t2 = _mm512_##op(__V, __t1); \
|
||||
__m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
|
||||
__m512i __t4 = _mm512_##op(__t2, __t3); \
|
||||
__m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
|
||||
__v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
|
||||
return __t6[0]
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_epi64(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(max_epi64);
|
||||
return __builtin_ia32_reduce_smax_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_epu64(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(max_epu64);
|
||||
return __builtin_ia32_reduce_umax_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_epi64(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(min_epi64);
|
||||
return __builtin_ia32_reduce_smin_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_epu64(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(min_epu64);
|
||||
return __builtin_ia32_reduce_umin_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
|
||||
_mm512_mask_reduce_operator(max_epi64);
|
||||
return __builtin_ia32_reduce_smax_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
|
||||
__V = _mm512_maskz_mov_epi64(__M, __V);
|
||||
_mm512_mask_reduce_operator(max_epu64);
|
||||
return __builtin_ia32_reduce_umax_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_epi64);
|
||||
return __builtin_ia32_reduce_smin_q512(__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_epu64);
|
||||
return __builtin_ia32_reduce_umin_q512(__V);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
|
||||
__m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
|
||||
__m256i __t3 = _mm256_##op(__t1, __t2); \
|
||||
__m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
|
||||
__m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
|
||||
__m128i __t6 = _mm_##op(__t4, __t5); \
|
||||
__m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
|
||||
__m128i __t8 = _mm_##op(__t6, __t7); \
|
||||
__m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
|
||||
__v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
|
||||
return __t10[0]
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_epi32(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(max_epi32);
|
||||
return __builtin_ia32_reduce_smax_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_epu32(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(max_epu32);
|
||||
return __builtin_ia32_reduce_umax_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_epi32(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(min_epi32);
|
||||
return __builtin_ia32_reduce_smin_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_epu32(__m512i __V) {
|
||||
_mm512_mask_reduce_operator(min_epu32);
|
||||
return __builtin_ia32_reduce_umin_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
|
||||
_mm512_mask_reduce_operator(max_epi32);
|
||||
return __builtin_ia32_reduce_smax_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
|
||||
__V = _mm512_maskz_mov_epi32(__M, __V);
|
||||
_mm512_mask_reduce_operator(max_epu32);
|
||||
return __builtin_ia32_reduce_umax_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_epi32);
|
||||
return __builtin_ia32_reduce_smin_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
||||
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_epu32);
|
||||
return __builtin_ia32_reduce_umin_d512((__v16si)__V);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
|
||||
|
||||
Vendored
+150
-55
@@ -18,13 +18,157 @@
|
||||
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
|
||||
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a S, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
|
||||
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
#define _mm256_dpbusd_epi32(S, A, B) \
|
||||
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
|
||||
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
|
||||
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
#define _mm256_dpbusds_epi32(S, A, B) \
|
||||
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
|
||||
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
|
||||
/// and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
|
||||
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
#define _mm256_dpwssd_epi32(S, A, B) \
|
||||
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
|
||||
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
|
||||
/// using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
|
||||
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
#define _mm256_dpwssds_epi32(S, A, B) \
|
||||
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
|
||||
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a S, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
|
||||
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
#define _mm_dpbusd_epi32(S, A, B) \
|
||||
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
|
||||
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
|
||||
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
#define _mm_dpbusds_epi32(S, A, B) \
|
||||
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
|
||||
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
|
||||
/// and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
|
||||
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
#define _mm_dpwssd_epi32(S, A, B) \
|
||||
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
|
||||
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
|
||||
/// using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
|
||||
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
#define _mm_dpwssds_epi32(S, A, B) \
|
||||
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
@@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
@@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
@@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
@@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
@@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
@@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
@@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
|
||||
Vendored
+225
@@ -0,0 +1,225 @@
|
||||
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVXVNNIINTRIN_H
|
||||
#define __AVXVNNIINTRIN_H
|
||||
|
||||
/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
|
||||
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
|
||||
/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
|
||||
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a __S, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
|
||||
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
|
||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
|
||||
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
|
||||
/// and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
|
||||
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
|
||||
/// using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:256] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a __S, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
|
||||
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
|
||||
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
|
||||
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
|
||||
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
|
||||
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
|
||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
|
||||
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
|
||||
/// and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
|
||||
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
|
||||
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
|
||||
/// using signed saturation, and store the packed 32-bit results in DST.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
||||
/// ENDFOR
|
||||
/// DST[MAX:128] := 0
|
||||
/// \endoperation
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
|
||||
#endif // __AVXVNNIINTRIN_H
|
||||
Vendored
+8
@@ -7,6 +7,9 @@
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CPUID_H
|
||||
#define __CPUID_H
|
||||
|
||||
#if !(__x86_64__ || __i386__)
|
||||
#error this header is for x86 only
|
||||
#endif
|
||||
@@ -186,6 +189,7 @@
|
||||
/* Features in %edx for leaf 7 sub-leaf 0 */
|
||||
#define bit_AVX5124VNNIW 0x00000004
|
||||
#define bit_AVX5124FMAPS 0x00000008
|
||||
#define bit_UINTR 0x00000020
|
||||
#define bit_SERIALIZE 0x00004000
|
||||
#define bit_TSXLDTRK 0x00010000
|
||||
#define bit_PCONFIG 0x00040000
|
||||
@@ -195,7 +199,9 @@
|
||||
#define bit_AMXINT8 0x02000000
|
||||
|
||||
/* Features in %eax for leaf 7 sub-leaf 1 */
|
||||
#define bit_AVXVNNI 0x00000008
|
||||
#define bit_AVX512BF16 0x00000020
|
||||
#define bit_HRESET 0x00400000
|
||||
|
||||
/* Features in %eax for leaf 13 sub-leaf 1 */
|
||||
#define bit_XSAVEOPT 0x00000001
|
||||
@@ -309,3 +315,5 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
|
||||
__cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif /* __CPUID_H */
|
||||
|
||||
@@ -26,6 +26,13 @@
|
||||
|
||||
#include_next <new>
|
||||
|
||||
#if !defined(__device__)
|
||||
// The header has been included too early from the standard C++ library
|
||||
// and CUDA-specific macros are not available yet.
|
||||
// Undo the include guard and try again later.
|
||||
#undef __CLANG_CUDA_WRAPPERS_NEW
|
||||
#else
|
||||
|
||||
#pragma push_macro("CUDA_NOEXCEPT")
|
||||
#if __cplusplus >= 201103L
|
||||
#define CUDA_NOEXCEPT noexcept
|
||||
@@ -95,4 +102,5 @@ __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
|
||||
|
||||
#pragma pop_macro("CUDA_NOEXCEPT")
|
||||
|
||||
#endif // __device__
|
||||
#endif // include guard
|
||||
|
||||
Vendored
+104
-105
@@ -14,38 +14,56 @@
|
||||
#ifndef __GFNIINTRIN_H
|
||||
#define __GFNIINTRIN_H
|
||||
|
||||
/* Default attributes for simple form (no masking). */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
|
||||
|
||||
/* Default attributes for YMM unmasked form. */
|
||||
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
|
||||
|
||||
/* Default attributes for ZMM forms. */
|
||||
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
|
||||
|
||||
/* Default attributes for VLX forms. */
|
||||
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
|
||||
|
||||
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
|
||||
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), \
|
||||
(char)(I))
|
||||
|
||||
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S))
|
||||
|
||||
|
||||
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
|
||||
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I)
|
||||
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
|
||||
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), \
|
||||
(char)(I))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
|
||||
(__v16qi) __B);
|
||||
}
|
||||
|
||||
#ifdef __AVXINTRIN_H
|
||||
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
|
||||
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
|
||||
(__v32qi)(__m256i)(B), \
|
||||
(char)(I))
|
||||
|
||||
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S))
|
||||
|
||||
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
|
||||
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I)
|
||||
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
|
||||
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
|
||||
(__v32qi)(__m256i)(B), \
|
||||
(char)(I))
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
|
||||
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
|
||||
(__v32qi) __B);
|
||||
}
|
||||
#endif /* __AVXINTRIN_H */
|
||||
|
||||
#ifdef __AVX512BWINTRIN_H
|
||||
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
|
||||
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
|
||||
(__v64qi)(__m512i)(B), \
|
||||
@@ -60,37 +78,6 @@
|
||||
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
|
||||
U, A, B, I)
|
||||
|
||||
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
|
||||
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), \
|
||||
(char)(I))
|
||||
|
||||
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S))
|
||||
|
||||
|
||||
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
|
||||
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I)
|
||||
|
||||
|
||||
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
|
||||
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
|
||||
(__v32qi)(__m256i)(B), \
|
||||
(char)(I))
|
||||
|
||||
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S))
|
||||
|
||||
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
|
||||
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I)
|
||||
|
||||
|
||||
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
|
||||
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
|
||||
(__v64qi)(__m512i)(B), \
|
||||
@@ -105,63 +92,6 @@
|
||||
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
|
||||
U, A, B, I)
|
||||
|
||||
/* Default attributes for simple form (no masking). */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
|
||||
|
||||
/* Default attributes for YMM unmasked form. */
|
||||
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
|
||||
|
||||
/* Default attributes for ZMM forms. */
|
||||
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
|
||||
|
||||
/* Default attributes for VLX forms. */
|
||||
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
|
||||
(__v16qi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
|
||||
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_selectb_128(__U,
|
||||
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
|
||||
(__v16qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
|
||||
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
|
||||
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
|
||||
(__v32qi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
|
||||
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_selectb_256(__U,
|
||||
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
|
||||
(__v32qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
|
||||
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
|
||||
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
|
||||
{
|
||||
@@ -183,6 +113,75 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
|
||||
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
#endif /* __AVX512BWINTRIN_H */
|
||||
|
||||
#ifdef __AVX512VLBWINTRIN_H
|
||||
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S))
|
||||
|
||||
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
|
||||
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I)
|
||||
|
||||
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S))
|
||||
|
||||
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
|
||||
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I)
|
||||
|
||||
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S))
|
||||
|
||||
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
|
||||
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I)
|
||||
|
||||
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S))
|
||||
|
||||
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
|
||||
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I)
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
|
||||
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_selectb_128(__U,
|
||||
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
|
||||
(__v16qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
|
||||
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
|
||||
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_selectb_256(__U,
|
||||
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
|
||||
(__v32qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
|
||||
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
#endif /* __AVX512VLBWINTRIN_H */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_Y
|
||||
|
||||
Vendored
+49
@@ -0,0 +1,49 @@
|
||||
/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __X86GPRINTRIN_H
|
||||
#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __HRESETINTRIN_H
|
||||
#define __HRESETINTRIN_H
|
||||
|
||||
#if __has_extension(gnu_asm)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("hreset")))
|
||||
|
||||
/// Provides a hint to the processor to selectively reset the prediction
|
||||
/// history of the current logical processor specified by a 32-bit integer
|
||||
/// value \a __eax.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
|
||||
///
|
||||
/// \operation
|
||||
/// IF __eax == 0
|
||||
/// // nop
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 31
|
||||
/// IF __eax[i]
|
||||
/// ResetPredictionFeature(i)
|
||||
/// FI
|
||||
/// ENDFOR
|
||||
/// FI
|
||||
/// \endoperation
|
||||
static __inline void __DEFAULT_FN_ATTRS
|
||||
_hreset(int __eax)
|
||||
{
|
||||
__asm__ ("hreset $0" :: "a"(__eax));
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __has_extension(gnu_asm) */
|
||||
|
||||
#endif /* __HRESETINTRIN_H */
|
||||
Vendored
+53
-44
@@ -14,6 +14,18 @@
|
||||
#ifndef __IA32INTRIN_H
|
||||
#define __IA32INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
|
||||
#endif
|
||||
|
||||
/** Find the first set bit starting from the lsb. Result is undefined if
|
||||
* input is 0.
|
||||
*
|
||||
@@ -26,7 +38,7 @@
|
||||
* A 32-bit integer operand.
|
||||
* \returns A 32-bit integer containing the bit number.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bsfd(int __A) {
|
||||
return __builtin_ctz(__A);
|
||||
}
|
||||
@@ -43,7 +55,7 @@ __bsfd(int __A) {
|
||||
* A 32-bit integer operand.
|
||||
* \returns A 32-bit integer containing the bit number.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bsrd(int __A) {
|
||||
return 31 - __builtin_clz(__A);
|
||||
}
|
||||
@@ -59,12 +71,12 @@ __bsrd(int __A) {
|
||||
* A 32-bit integer operand.
|
||||
* \returns A 32-bit integer containing the swapped bytes.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bswapd(int __A) {
|
||||
return __builtin_bswap32(__A);
|
||||
}
|
||||
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_bswap(int __A) {
|
||||
return __builtin_bswap32(__A);
|
||||
}
|
||||
@@ -85,7 +97,7 @@ _bswap(int __A) {
|
||||
* A 64-bit integer operand.
|
||||
* \returns A 32-bit integer containing the bit number.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bsfq(long long __A) {
|
||||
return __builtin_ctzll(__A);
|
||||
}
|
||||
@@ -102,7 +114,7 @@ __bsfq(long long __A) {
|
||||
* A 64-bit integer operand.
|
||||
* \returns A 32-bit integer containing the bit number.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bsrq(long long __A) {
|
||||
return 63 - __builtin_clzll(__A);
|
||||
}
|
||||
@@ -118,7 +130,7 @@ __bsrq(long long __A) {
|
||||
* A 64-bit integer operand.
|
||||
* \returns A 64-bit integer containing the swapped bytes.
|
||||
*/
|
||||
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__bswapq(long long __A) {
|
||||
return __builtin_bswap64(__A);
|
||||
}
|
||||
@@ -138,7 +150,7 @@ __bswapq(long long __A) {
|
||||
* \returns A 32-bit integer containing the number of bits with value 1 in the
|
||||
* source operand.
|
||||
*/
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__popcntd(unsigned int __A)
|
||||
{
|
||||
return __builtin_popcount(__A);
|
||||
@@ -159,7 +171,7 @@ __popcntd(unsigned int __A)
|
||||
* \returns A 64-bit integer containing the number of bits with value 1 in the
|
||||
* source operand.
|
||||
*/
|
||||
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__popcntq(unsigned long long __A)
|
||||
{
|
||||
return __builtin_popcountll(__A);
|
||||
@@ -169,26 +181,26 @@ __popcntq(unsigned long long __A)
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__readeflags(void)
|
||||
{
|
||||
return __builtin_ia32_readeflags_u64();
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__writeeflags(unsigned long long __f)
|
||||
{
|
||||
__builtin_ia32_writeeflags_u64(__f);
|
||||
}
|
||||
|
||||
#else /* !__x86_64__ */
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__readeflags(void)
|
||||
{
|
||||
return __builtin_ia32_readeflags_u32();
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__writeeflags(unsigned int __f)
|
||||
{
|
||||
__builtin_ia32_writeeflags_u32(__f);
|
||||
@@ -205,11 +217,9 @@ __writeeflags(unsigned int __f)
|
||||
* A 32-bit float value.
|
||||
* \returns a 32-bit unsigned integer containing the converted value.
|
||||
*/
|
||||
static __inline__ unsigned int __attribute__((__always_inline__))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
|
||||
_castf32_u32(float __A) {
|
||||
unsigned int D;
|
||||
__builtin_memcpy(&D, &__A, sizeof(__A));
|
||||
return D;
|
||||
return __builtin_bit_cast(unsigned int, __A);
|
||||
}
|
||||
|
||||
/** Cast a 64-bit float value to a 64-bit unsigned integer value
|
||||
@@ -222,11 +232,9 @@ _castf32_u32(float __A) {
|
||||
* A 64-bit float value.
|
||||
* \returns a 64-bit unsigned integer containing the converted value.
|
||||
*/
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
|
||||
_castf64_u64(double __A) {
|
||||
unsigned long long D;
|
||||
__builtin_memcpy(&D, &__A, sizeof(__A));
|
||||
return D;
|
||||
return __builtin_bit_cast(unsigned long long, __A);
|
||||
}
|
||||
|
||||
/** Cast a 32-bit unsigned integer value to a 32-bit float value
|
||||
@@ -239,11 +247,9 @@ _castf64_u64(double __A) {
|
||||
* A 32-bit unsigned integer value.
|
||||
* \returns a 32-bit float value containing the converted value.
|
||||
*/
|
||||
static __inline__ float __attribute__((__always_inline__))
|
||||
static __inline__ float __DEFAULT_FN_ATTRS_CAST
|
||||
_castu32_f32(unsigned int __A) {
|
||||
float D;
|
||||
__builtin_memcpy(&D, &__A, sizeof(__A));
|
||||
return D;
|
||||
return __builtin_bit_cast(float, __A);
|
||||
}
|
||||
|
||||
/** Cast a 64-bit unsigned integer value to a 64-bit float value
|
||||
@@ -256,11 +262,9 @@ _castu32_f32(unsigned int __A) {
|
||||
* A 64-bit unsigned integer value.
|
||||
* \returns a 64-bit float value containing the converted value.
|
||||
*/
|
||||
static __inline__ double __attribute__((__always_inline__))
|
||||
static __inline__ double __DEFAULT_FN_ATTRS_CAST
|
||||
_castu64_f64(unsigned long long __A) {
|
||||
double D;
|
||||
__builtin_memcpy(&D, &__A, sizeof(__A));
|
||||
return D;
|
||||
return __builtin_bit_cast(double, __A);
|
||||
}
|
||||
|
||||
/** Adds the unsigned integer operand to the CRC-32C checksum of the
|
||||
@@ -278,7 +282,7 @@ _castu64_f64(unsigned long long __A) {
|
||||
* \returns The result of adding operand \a __C to the CRC-32C checksum of
|
||||
* operand \a __D.
|
||||
*/
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
|
||||
__crc32b(unsigned int __C, unsigned char __D)
|
||||
{
|
||||
return __builtin_ia32_crc32qi(__C, __D);
|
||||
@@ -299,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D)
|
||||
* \returns The result of adding operand \a __C to the CRC-32C checksum of
|
||||
* operand \a __D.
|
||||
*/
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
|
||||
__crc32w(unsigned int __C, unsigned short __D)
|
||||
{
|
||||
return __builtin_ia32_crc32hi(__C, __D);
|
||||
@@ -320,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D)
|
||||
* \returns The result of adding operand \a __C to the CRC-32C checksum of
|
||||
* operand \a __D.
|
||||
*/
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
|
||||
__crc32d(unsigned int __C, unsigned int __D)
|
||||
{
|
||||
return __builtin_ia32_crc32si(__C, __D);
|
||||
@@ -342,20 +346,20 @@ __crc32d(unsigned int __C, unsigned int __D)
|
||||
* \returns The result of adding operand \a __C to the CRC-32C checksum of
|
||||
* operand \a __D.
|
||||
*/
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
|
||||
__crc32q(unsigned long long __C, unsigned long long __D)
|
||||
{
|
||||
return __builtin_ia32_crc32di(__C, __D);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__rdpmc(int __A) {
|
||||
return __builtin_ia32_rdpmc(__A);
|
||||
}
|
||||
|
||||
/* __rdtscp */
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__rdtscp(unsigned int *__A) {
|
||||
return __builtin_ia32_rdtscp(__A);
|
||||
}
|
||||
@@ -364,48 +368,48 @@ __rdtscp(unsigned int *__A) {
|
||||
|
||||
#define _rdpmc(A) __rdpmc(A)
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_wbinvd(void) {
|
||||
__builtin_ia32_wbinvd();
|
||||
}
|
||||
|
||||
static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rolb(unsigned char __X, int __C) {
|
||||
return __builtin_rotateleft8(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rorb(unsigned char __X, int __C) {
|
||||
return __builtin_rotateright8(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rolw(unsigned short __X, int __C) {
|
||||
return __builtin_rotateleft16(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rorw(unsigned short __X, int __C) {
|
||||
return __builtin_rotateright16(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rold(unsigned int __X, int __C) {
|
||||
return __builtin_rotateleft32(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rord(unsigned int __X, int __C) {
|
||||
return __builtin_rotateright32(__X, __C);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rolq(unsigned long long __X, int __C) {
|
||||
return __builtin_rotateleft64(__X, __C);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rorq(unsigned long long __X, int __C) {
|
||||
return __builtin_rotateright64(__X, __C);
|
||||
}
|
||||
@@ -429,4 +433,9 @@ __rorq(unsigned long long __X, int __C) {
|
||||
#define _rotwl(a,b) __rolw((a), (b))
|
||||
#define _rotwr(a,b) __rorw((a), (b))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_CAST
|
||||
#undef __DEFAULT_FN_ATTRS_SSE42
|
||||
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
|
||||
#endif /* __IA32INTRIN_H */
|
||||
|
||||
Vendored
+12
@@ -10,6 +10,8 @@
|
||||
#ifndef __IMMINTRIN_H
|
||||
#define __IMMINTRIN_H
|
||||
|
||||
#include <x86gprintrin.h>
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__MMX__)
|
||||
#include <mmintrin.h>
|
||||
@@ -143,6 +145,11 @@
|
||||
#include <avx512vlvnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVXVNNI__)
|
||||
#include <avxvnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512DQ__)
|
||||
#include <avx512dqintrin.h>
|
||||
@@ -471,6 +478,11 @@ _storebe_i64(void * __P, long long __D) {
|
||||
#include <invpcidintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__KL__) || defined(__WIDEKL__)
|
||||
#include <keylockerintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
|
||||
#include <amxintrin.h>
|
||||
|
||||
Vendored
+80
-93
@@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char);
|
||||
void __addfsdword(unsigned long, unsigned long);
|
||||
void __addfsword(unsigned long, unsigned short);
|
||||
void __code_seg(const char *);
|
||||
static __inline__
|
||||
void __cpuid(int[4], int);
|
||||
static __inline__
|
||||
void __cpuidex(int[4], int, int);
|
||||
static __inline__
|
||||
__int64 __emul(int, int);
|
||||
static __inline__
|
||||
unsigned __int64 __emulu(unsigned int, unsigned int);
|
||||
unsigned int __getcallerseflags(void);
|
||||
static __inline__
|
||||
void __halt(void);
|
||||
unsigned char __inbyte(unsigned short);
|
||||
void __inbytestring(unsigned short, unsigned char *, unsigned long);
|
||||
@@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long);
|
||||
void __lidt(void *);
|
||||
unsigned __int64 __ll_lshift(unsigned __int64, int);
|
||||
__int64 __ll_rshift(__int64, int);
|
||||
static __inline__
|
||||
void __movsb(unsigned char *, unsigned char const *, size_t);
|
||||
static __inline__
|
||||
void __movsd(unsigned long *, unsigned long const *, size_t);
|
||||
static __inline__
|
||||
void __movsw(unsigned short *, unsigned short const *, size_t);
|
||||
static __inline__
|
||||
void __nop(void);
|
||||
void __nvreg_restore_fence(void);
|
||||
void __nvreg_save_fence(void);
|
||||
@@ -105,23 +96,16 @@ unsigned long __readcr4(void);
|
||||
unsigned long __readcr8(void);
|
||||
unsigned int __readdr(unsigned int);
|
||||
#ifdef __i386__
|
||||
static __inline__
|
||||
unsigned char __readfsbyte(unsigned long);
|
||||
static __inline__
|
||||
unsigned __int64 __readfsqword(unsigned long);
|
||||
static __inline__
|
||||
unsigned short __readfsword(unsigned long);
|
||||
#endif
|
||||
static __inline__
|
||||
unsigned __int64 __readmsr(unsigned long);
|
||||
unsigned __int64 __readpmc(unsigned long);
|
||||
unsigned long __segmentlimit(unsigned long);
|
||||
void __sidt(void *);
|
||||
static __inline__
|
||||
void __stosb(unsigned char *, unsigned char, size_t);
|
||||
static __inline__
|
||||
void __stosd(unsigned long *, unsigned long, size_t);
|
||||
static __inline__
|
||||
void __stosw(unsigned short *, unsigned short, size_t);
|
||||
void __svm_clgi(void);
|
||||
void __svm_invlpga(void *, int);
|
||||
@@ -136,7 +120,6 @@ void __vmx_off(void);
|
||||
void __vmx_vmptrst(unsigned __int64 *);
|
||||
void __wbinvd(void);
|
||||
void __writecr0(unsigned int);
|
||||
static __inline__
|
||||
void __writecr3(unsigned __INTPTR_TYPE__);
|
||||
void __writecr4(unsigned int);
|
||||
void __writecr8(unsigned int);
|
||||
@@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long);
|
||||
void __writefsqword(unsigned long, unsigned __int64);
|
||||
void __writefsword(unsigned long, unsigned short);
|
||||
void __writemsr(unsigned long, unsigned __int64);
|
||||
static __inline__
|
||||
void *_AddressOfReturnAddress(void);
|
||||
static __inline__
|
||||
unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
|
||||
static __inline__
|
||||
unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
|
||||
unsigned char _bittest(long const *, long);
|
||||
unsigned char _bittestandcomplement(long *, long);
|
||||
@@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
|
||||
long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
|
||||
__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
|
||||
__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
|
||||
static __inline__ void
|
||||
__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
|
||||
_ReadBarrier(void);
|
||||
static __inline__ void
|
||||
__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
|
||||
_ReadWriteBarrier(void);
|
||||
void __attribute__((__deprecated__(
|
||||
"use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
|
||||
void __attribute__((__deprecated__(
|
||||
"use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
|
||||
unsigned int _rorx_u32(unsigned int, const unsigned int);
|
||||
int _sarx_i32(int, unsigned int);
|
||||
#if __STDC_HOSTED__
|
||||
@@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
|
||||
void _Store_HLERelease(long volatile *, long);
|
||||
void _Store64_HLERelease(__int64 volatile *, __int64);
|
||||
void _StorePointer_HLERelease(void *volatile *, void *);
|
||||
static __inline__ void
|
||||
__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
|
||||
_WriteBarrier(void);
|
||||
void __attribute__((__deprecated__(
|
||||
"use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
|
||||
unsigned __int32 xbegin(void);
|
||||
void _xend(void);
|
||||
|
||||
@@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char);
|
||||
void __addgsdword(unsigned long, unsigned long);
|
||||
void __addgsqword(unsigned long, unsigned __int64);
|
||||
void __addgsword(unsigned long, unsigned short);
|
||||
static __inline__
|
||||
void __faststorefence(void);
|
||||
void __incgsbyte(unsigned long);
|
||||
void __incgsdword(unsigned long);
|
||||
void __incgsqword(unsigned long);
|
||||
void __incgsword(unsigned long);
|
||||
static __inline__
|
||||
void __movsq(unsigned long long *, unsigned long long const *, size_t);
|
||||
static __inline__
|
||||
unsigned char __readgsbyte(unsigned long);
|
||||
static __inline__
|
||||
unsigned long __readgsdword(unsigned long);
|
||||
static __inline__
|
||||
unsigned __int64 __readgsqword(unsigned long);
|
||||
unsigned short __readgsword(unsigned long);
|
||||
unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
|
||||
@@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
|
||||
unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
|
||||
unsigned __int64 _HighPart,
|
||||
unsigned char _Shift);
|
||||
static __inline__
|
||||
void __stosq(unsigned __int64 *, unsigned __int64, size_t);
|
||||
unsigned char __vmx_on(unsigned __int64 *);
|
||||
unsigned char __vmx_vmclear(unsigned __int64 *);
|
||||
@@ -243,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
|
||||
unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
|
||||
long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
|
||||
long _Comparand);
|
||||
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
__int64 *_CompareandResult);
|
||||
unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
@@ -269,13 +236,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
|
||||
__int64 _sarx_i64(__int64, unsigned int);
|
||||
unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
|
||||
unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
|
||||
static __inline__
|
||||
__int64 __mulh(__int64, __int64);
|
||||
static __inline__
|
||||
unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
|
||||
static __inline__
|
||||
__int64 _mul128(__int64, __int64, __int64*);
|
||||
static __inline__
|
||||
unsigned __int64 _umul128(unsigned __int64,
|
||||
unsigned __int64,
|
||||
unsigned __int64*);
|
||||
@@ -284,29 +247,19 @@ unsigned __int64 _umul128(unsigned __int64,
|
||||
|
||||
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
|
||||
|
||||
static __inline__
|
||||
unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
|
||||
static __inline__
|
||||
unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
|
||||
static __inline__
|
||||
__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
|
||||
static __inline__
|
||||
__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
|
||||
static __inline__
|
||||
__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
|
||||
static __inline__
|
||||
__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
|
||||
static __inline__
|
||||
__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
|
||||
static __inline__
|
||||
__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
|
||||
static __inline__
|
||||
__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
|
||||
static __inline__
|
||||
__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
|
||||
|
||||
#endif
|
||||
@@ -470,45 +423,81 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
|
||||
__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
|
||||
__int64 _Exchange, __int64 _Comparand);
|
||||
#endif
|
||||
#if defined(__x86_64__) || defined(__aarch64__)
|
||||
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
__int64 *_ComparandResult);
|
||||
#endif
|
||||
#if defined(__aarch64__)
|
||||
unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
__int64 *_ComparandResult);
|
||||
unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
__int64 *_ComparandResult);
|
||||
unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
|
||||
__int64 _ExchangeHigh,
|
||||
__int64 _ExchangeLow,
|
||||
__int64 *_ComparandResult);
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------------------*\
|
||||
|* movs, stos
|
||||
\*----------------------------------------------------------------------------*/
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
|
||||
unsigned char const *__src,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
: : "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
|
||||
__asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
: : "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
|
||||
__asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
: : "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
|
||||
__asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
|
||||
unsigned long const *__src,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep movsl"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
|
||||
__asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
|
||||
unsigned short const *__src,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep movsw"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
|
||||
unsigned long __x,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep stosl"
|
||||
: "+D"(__dst), "+c"(__n)
|
||||
: "a"(__x)
|
||||
: "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst,
|
||||
unsigned short __x,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep stosw"
|
||||
: "+D"(__dst), "+c"(__n)
|
||||
: "a"(__x)
|
||||
: "memory");
|
||||
}
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
|
||||
__asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
: : "memory");
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsq(
|
||||
unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
|
||||
__asm__ __volatile__("rep movsq"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|
||||
unsigned __int64 __x,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
|
||||
: "memory");
|
||||
}
|
||||
@@ -518,26 +507,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
|
||||
|* Misc
|
||||
\*----------------------------------------------------------------------------*/
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__cpuid(int __info[4], int __level) {
|
||||
__asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(0));
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
|
||||
__asm__("cpuid"
|
||||
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(0));
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__cpuidex(int __info[4], int __level, int __ecx) {
|
||||
__asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(__ecx));
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
|
||||
int __ecx) {
|
||||
__asm__("cpuid"
|
||||
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(__ecx));
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__halt(void) {
|
||||
__asm__ volatile ("hlt");
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
|
||||
__asm__ volatile("hlt");
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__nop(void) {
|
||||
__asm__ volatile ("nop");
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
|
||||
__asm__ volatile("nop");
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -574,8 +562,7 @@ __readmsr(unsigned long __register) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS
|
||||
__readcr3(void) {
|
||||
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
|
||||
unsigned __LPTRINT_TYPE__ __cr3_val;
|
||||
__asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
|
||||
return __cr3_val;
|
||||
|
||||
Vendored
+506
@@ -0,0 +1,506 @@
|
||||
/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef _KEYLOCKERINTRIN_H
|
||||
#define _KEYLOCKERINTRIN_H
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__KL__)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("kl"),\
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl
|
||||
/// will be assigned to EAX, which specifies the KeySource and whether backing up
|
||||
/// the key is permitted. The 256-bit encryption key is loaded from the two
|
||||
/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
|
||||
/// loaded from the implicit operand XMM0, which is assigned from __intkey.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instruction.
|
||||
///
|
||||
/// \operation
|
||||
/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// IF “LOADIWKEY exiting” VM execution control set
|
||||
/// VMexit
|
||||
/// FI
|
||||
/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// IF (__ctl[4:1] == 0) // KeySource of 0.
|
||||
/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
|
||||
/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
|
||||
/// IWKey.IntegrityKey[127:0] := __intkey[127:0]
|
||||
/// IWKey.NoBackup := __ctl[0]
|
||||
/// IWKey.KeySource := __ctl[4:1]
|
||||
/// ZF := 0
|
||||
/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
|
||||
/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
|
||||
/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
|
||||
/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
|
||||
/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
|
||||
/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
|
||||
/// IWKey.NoBackup := __ctl[0]
|
||||
/// IWKey.KeySource := __ctl[4:1]
|
||||
/// ZF := 0
|
||||
/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded
|
||||
/// ZF := 1
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
|
||||
__m128i __enkey_lo, __m128i __enkey_hi) {
|
||||
/* Thin wrapper around the builtin. Note the argument order differs from this
   function's parameter list: the builtin takes the three key operands first
   and __ctl last. Nothing is returned to the caller. */
__builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
|
||||
}
|
||||
|
||||
/// Wrap a 128-bit AES key from __key into a key handle and output in
|
||||
/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return.
|
||||
/// The explicit source operand __htype specifies handle restrictions.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// InputKey[127:0] := __key[127:0]
|
||||
/// KeyMetadata[2:0] := __htype[2:0]
|
||||
/// KeyMetadata[23:3] := 0 // Reserved for future usage
|
||||
/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
|
||||
/// KeyMetadata[127:28] := 0 // Reserved for future usage
|
||||
/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
|
||||
/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
|
||||
/// dst[0] := IWKey.NoBackup
|
||||
/// dst[4:1] := IWKey.KeySource[3:0]
|
||||
/// dst[31:5] := 0
|
||||
/// MEM[__h+127:__h] := Handle[127:0] // AAD
|
||||
/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
|
||||
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
|
||||
/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
|
||||
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
|
||||
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// ZF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
|
||||
/* Forward to the builtin: __key is reinterpreted as __v2di, __h is passed
   through untouched, and the builtin's 32-bit result is returned as-is. */
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
|
||||
}
|
||||
|
||||
/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
|
||||
/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and
|
||||
/// a 32-bit value as return.
|
||||
/// The explicit source operand __htype specifies handle restrictions.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// InputKey[127:0] := __key_lo[127:0]
|
||||
/// InputKey[255:128] := __key_hi[127:0]
|
||||
/// KeyMetadata[2:0] := __htype[2:0]
|
||||
/// KeyMetadata[23:3] := 0 // Reserved for future usage
|
||||
/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
|
||||
/// KeyMetadata[127:28] := 0 // Reserved for future usage
|
||||
/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
|
||||
/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
|
||||
/// dst[0] := IWKey.NoBackup
|
||||
/// dst[4:1] := IWKey.KeySource[3:0]
|
||||
/// dst[31:5] := 0
|
||||
/// MEM[__h+127:__h] := Handle[127:0] // AAD
|
||||
/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
|
||||
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
|
||||
/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
|
||||
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
|
||||
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
|
||||
/// MEM[__h+895:__h+768] := 0 // Reserved for future usage
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// ZF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
|
||||
void *__h) {
|
||||
/* Forward to the builtin: both key halves are reinterpreted as __v2di (low
   half first), __h is passed through untouched, and the builtin's 32-bit
   result is returned as-is. */
return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
|
||||
(__v2di)__key_hi, __h);
|
||||
}
|
||||
|
||||
/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using
|
||||
/// the 128-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[383:256] ||
|
||||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
/* Forward to the builtin: the output pointer and the input block are
   reinterpreted as __v2di, the handle pointer is passed through, and the
   builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
|
||||
/// the 256-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
/* Forward to the builtin: the output pointer and the input block are
   reinterpreted as __v2di, the handle pointer is passed through, and the
   builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using
|
||||
/// the 128-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
|
||||
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[383:256] ||
|
||||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
/* Forward to the builtin: the output pointer and the input block are
   reinterpreted as __v2di, the handle pointer is passed through, and the
   builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
/// The AESDEC256KL performs 14 rounds of AES to decrypt the __idata using
|
||||
/// the 256-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[511:0] := MEM[__h+511:__h]
|
||||
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[383:256] ||
|
||||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
/* Forward to the builtin: the output pointer and the input block are
   reinterpreted as __v2di, the handle pointer is passed through, and the
   builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
|
||||
|| defined(__KL__) */
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__WIDEKL__)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
|
||||
/// at __h and store each resultant block back from __odata to __odata+7. And
|
||||
/// return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle := MEM[__h+383:__h]
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
|
||||
/// ENDFOR
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
|
||||
/* Forward to the builtin: both block arrays are passed as __v2di pointers
   (the input one const-qualified), the handle pointer is passed through, and
   the builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
|
||||
(const __v2di *)__idata, __h);
|
||||
}
|
||||
|
||||
/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
|
||||
/// at __h and store each resultant block back from __odata to __odata+7. And
|
||||
/// return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[511:0] := MEM[__h+511:__h]
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
|
||||
/// ENDFOR
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
|
||||
/* Forward to the builtin: both block arrays are passed as __v2di pointers
   (the input one const-qualified), the handle pointer is passed through, and
   the builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
|
||||
(const __v2di *)__idata, __h);
|
||||
}
|
||||
|
||||
/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
|
||||
/// at __h and store each resultant block back from __odata to __odata+7. And
|
||||
/// return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[383:0] := MEM[__h+383:__h]
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
|
||||
/// ENDFOR
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
|
||||
/* Forward to the builtin: both block arrays are passed as __v2di pointers
   (the input one const-qualified), the handle pointer is passed through, and
   the builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
|
||||
(const __v2di *)__idata, __h);
|
||||
}
|
||||
|
||||
/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
|
||||
/// at __h and store each resultant block back from __odata to __odata+7. And
|
||||
/// return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// Handle[511:0] := MEM[__h+511:__h]
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
|
||||
/// ENDFOR
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
|
||||
/* Forward to the builtin: both block arrays are passed as __v2di pointers
   (the input one const-qualified), the handle pointer is passed through, and
   the builtin's unsigned char result is returned unchanged. */
return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
|
||||
(const __v2di *)__idata, __h);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
|
||||
|| defined(__WIDEKL__) */
|
||||
|
||||
#endif /* _KEYLOCKERINTRIN_H */
|
||||
Vendored
+6
@@ -54,7 +54,13 @@ _mm_malloc(size_t __size, size_t __align)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
_mm_free(void *__p)
|
||||
{
|
||||
/* Release __p with a platform-specific deallocator. __MINGW32__ is tested
   before _WIN32, so MinGW builds use __mingw_aligned_free and other Windows
   builds use _aligned_free; everything else falls back to free().
   NOTE(review): presumably each branch mirrors the allocator chosen by
   _mm_malloc, which is not visible in this hunk -- confirm. */
#if defined(__MINGW32__)
|
||||
__mingw_aligned_free(__p);
|
||||
#elif defined(_WIN32)
|
||||
_aligned_free(__p);
|
||||
#else
|
||||
free(__p);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
Vendored
+18
@@ -9,6 +9,21 @@
|
||||
#ifndef _OPENCL_BASE_H_
|
||||
#define _OPENCL_BASE_H_
|
||||
|
||||
// Define extension macros
|
||||
|
||||
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
|
||||
// For SPIR all extensions are supported.
|
||||
#if defined(__SPIR__)
|
||||
#define cl_khr_subgroup_extended_types 1
|
||||
#define cl_khr_subgroup_non_uniform_vote 1
|
||||
#define cl_khr_subgroup_ballot 1
|
||||
#define cl_khr_subgroup_non_uniform_arithmetic 1
|
||||
#define cl_khr_subgroup_shuffle 1
|
||||
#define cl_khr_subgroup_shuffle_relative 1
|
||||
#define cl_khr_subgroup_clustered_reduce 1
|
||||
#endif // defined(__SPIR__)
|
||||
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
|
||||
|
||||
// built-in scalar data types:
|
||||
|
||||
/**
|
||||
@@ -568,4 +583,7 @@ typedef struct {
|
||||
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
|
||||
#endif // cl_intel_device_side_avc_motion_estimation
|
||||
|
||||
// Disable any extensions we may have enabled previously.
|
||||
#pragma OPENCL EXTENSION all : disable
|
||||
|
||||
#endif //_OPENCL_BASE_H_
|
||||
|
||||
Vendored
+2
@@ -4633,6 +4633,7 @@ float16 __ovld __cnfn convert_float16(float16);
|
||||
// Conversions with double data type parameters or return value.
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
char __ovld __cnfn convert_char(double);
|
||||
char __ovld __cnfn convert_char_rte(double);
|
||||
char __ovld __cnfn convert_char_rtn(double);
|
||||
@@ -5455,6 +5456,7 @@ double16 __ovld __cnfn convert_double16_rtz(ushort16);
|
||||
#endif //cl_khr_fp64
|
||||
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
// Convert half types to non-double types.
|
||||
uchar __ovld __cnfn convert_uchar(half);
|
||||
uchar __ovld __cnfn convert_uchar_rte(half);
|
||||
|
||||
@@ -24,8 +24,11 @@
|
||||
// which might live in cstdlib.
|
||||
#include <cstdlib>
|
||||
|
||||
// We need limits because __clang_cuda_cmath.h below uses `std::numeric_limits`.
|
||||
#include <limits>
|
||||
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any, allow_templates)})
|
||||
|
||||
#define __CUDA__
|
||||
#define __OPENMP_NVPTX__
|
||||
|
||||
@@ -25,3 +25,28 @@
|
||||
|
||||
// Grab the host header too.
|
||||
#include_next <complex>
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
|
||||
// after including <cmath> above. Since the complex header we use is a
|
||||
// simplified version of the libc++, we don't need it in this case. If we
|
||||
// compile against libstdc++, or any other standard library, we will overload
|
||||
// the (hopefully template) functions in the <complex> header with the ones we
|
||||
// got from libc++ which decomposes math functions, like `std::sin`, into
|
||||
// arithmetic and calls to non-complex functions, all of which we can then
|
||||
// handle.
|
||||
#ifndef _LIBCPP_STD_VER
|
||||
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, \
|
||||
implementation = {extension(match_any, allow_templates)})
|
||||
|
||||
#include <complex_cmath.h>
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,388 @@
|
||||
//===------------------------- __complex_cmath.h --------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// std::complex header copied from the libcxx source and simplified for use in
|
||||
// OpenMP target offload regions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This file is for C++ compilation only."
|
||||
#endif
|
||||
|
||||
#ifndef _LIBCPP_COMPLEX
|
||||
#define _LIBCPP_COMPLEX
|
||||
|
||||
#include <cmath>
|
||||
#include <type_traits>
|
||||
|
||||
#define __DEVICE__ static constexpr __attribute__((nothrow))
|
||||
|
||||
namespace std {
|
||||
|
||||
// abs
|
||||
|
||||
// Magnitude of __c, computed as hypot(real part, imaginary part).
template <class _Tp> __DEVICE__ _Tp abs(const std::complex<_Tp> &__c) {
|
||||
return hypot(__c.real(), __c.imag());
|
||||
}
|
||||
|
||||
// arg
|
||||
|
||||
// Phase angle of __c, computed as atan2(imaginary part, real part).
template <class _Tp> __DEVICE__ _Tp arg(const std::complex<_Tp> &__c) {
|
||||
return atan2(__c.imag(), __c.real());
|
||||
}
|
||||
|
||||
// arg for a real-valued argument. This overload participates only when _Tp is
// an integral type or double; it always returns double and treats the
// imaginary part as 0.
template <class _Tp>
|
||||
typename enable_if<is_integral<_Tp>::value || is_same<_Tp, double>::value,
|
||||
double>::type
|
||||
arg(_Tp __re) {
|
||||
return atan2(0., __re);
|
||||
}
|
||||
|
||||
// arg for a real-valued float argument. Enabled only when _Tp is exactly
// float; uses the single-precision atan2f with a zero imaginary part.
template <class _Tp>
|
||||
typename enable_if<is_same<_Tp, float>::value, float>::type arg(_Tp __re) {
|
||||
return atan2f(0.F, __re);
|
||||
}
|
||||
|
||||
// norm
|
||||
|
||||
// Squared magnitude of __c: real^2 + imag^2. If either component is infinite
// the absolute value of that component is returned instead (real part is
// checked first), so the sum below is never evaluated with an infinite input.
template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
|
||||
if (std::isinf(__c.real()))
|
||||
return abs(__c.real());
|
||||
if (std::isinf(__c.imag()))
|
||||
return abs(__c.imag());
|
||||
return __c.real() * __c.real() + __c.imag() * __c.imag();
|
||||
}
|
||||
|
||||
// conj
|
||||
|
||||
// Complex conjugate: same real part, negated imaginary part.
template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
|
||||
return std::complex<_Tp>(__c.real(), -__c.imag());
|
||||
}
|
||||
|
||||
// proj
|
||||
|
||||
// Projection of __c: returns __c unchanged unless either component is
// infinite, in which case the result is (INFINITY, zero carrying the sign of
// the imaginary part).
template <class _Tp> std::complex<_Tp> proj(const std::complex<_Tp> &__c) {
|
||||
std::complex<_Tp> __r = __c;
|
||||
if (std::isinf(__c.real()) || std::isinf(__c.imag()))
|
||||
__r = std::complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
|
||||
return __r;
|
||||
}
|
||||
|
||||
// polar
|
||||
|
||||
// Builds a complex number from magnitude __rho and angle __theta (default
// angle is _Tp()), i.e. (__rho*cos(__theta), __rho*sin(__theta)), with the
// special cases handled first, in this order:
//   - __rho is NaN or has its sign bit set      -> (NaN, NaN)
//   - __theta is NaN: infinite __rho            -> (__rho, __theta)
//                     otherwise                 -> (__theta, __theta)
//   - __theta is infinite: infinite __rho       -> (__rho, NaN)
//                          otherwise            -> (NaN, NaN)
// In the general case a NaN product in either component is replaced by 0.
template <class _Tp>
|
||||
complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp()) {
|
||||
if (std::isnan(__rho) || signbit(__rho))
|
||||
return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
|
||||
if (std::isnan(__theta)) {
|
||||
if (std::isinf(__rho))
|
||||
return std::complex<_Tp>(__rho, __theta);
|
||||
return std::complex<_Tp>(__theta, __theta);
|
||||
}
|
||||
if (std::isinf(__theta)) {
|
||||
if (std::isinf(__rho))
|
||||
return std::complex<_Tp>(__rho, _Tp(NAN));
|
||||
return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
|
||||
}
|
||||
_Tp __x = __rho * cos(__theta);
|
||||
if (std::isnan(__x))
|
||||
__x = 0;
|
||||
_Tp __y = __rho * sin(__theta);
|
||||
if (std::isnan(__y))
|
||||
__y = 0;
|
||||
return std::complex<_Tp>(__x, __y);
|
||||
}
|
||||
|
||||
// log
|
||||
|
||||
// Complex natural logarithm: real part is log of the magnitude, imaginary
// part is the phase angle.
template <class _Tp> std::complex<_Tp> log(const std::complex<_Tp> &__x) {
|
||||
return std::complex<_Tp>(log(abs(__x)), arg(__x));
|
||||
}
|
||||
|
||||
// log10
|
||||
|
||||
// Complex base-10 logarithm via change of base: log(__x) / log(10).
template <class _Tp> std::complex<_Tp> log10(const std::complex<_Tp> &__x) {
|
||||
return log(__x) / log(_Tp(10));
|
||||
}
|
||||
|
||||
// sqrt
|
||||
|
||||
// Complex square root. Infinite inputs are handled explicitly; everything
// else goes through polar(sqrt(|x|), arg(x) / 2).
//   - infinite imaginary part -> (INFINITY, imag), regardless of the real part
//   - real part is +inf       -> (real, imag if imag is NaN, else zero with
//                                 the sign of imag)
//   - real part is -inf       -> (imag if imag is NaN, else 0,
//                                 real with the sign of imag)
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> sqrt(const std::complex<_Tp> &__x) {
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(_Tp(INFINITY), __x.imag());
|
||||
if (std::isinf(__x.real())) {
|
||||
if (__x.real() > _Tp(0))
|
||||
return std::complex<_Tp>(__x.real(), std::isnan(__x.imag())
|
||||
? __x.imag()
|
||||
: copysign(_Tp(0), __x.imag()));
|
||||
return std::complex<_Tp>(std::isnan(__x.imag()) ? __x.imag() : _Tp(0),
|
||||
copysign(__x.real(), __x.imag()));
|
||||
}
|
||||
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
|
||||
}
|
||||
|
||||
// exp
|
||||
|
||||
// Complex exponential: exp(real) * (cos(imag), sin(imag)), with these
// adjustments applied before the general formula:
//   - real is -inf and imag is not finite: imag is replaced by 1 so the
//     cos/sin factors below are computed from a finite angle
//   - real is +inf and imag is zero or not finite: returns (real, imag), with
//     an infinite imag replaced by NaN
//   - real is NaN and imag is zero: returns __x unchanged
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> exp(const std::complex<_Tp> &__x) {
|
||||
_Tp __i = __x.imag();
|
||||
if (std::isinf(__x.real())) {
|
||||
if (__x.real() < _Tp(0)) {
|
||||
if (!std::isfinite(__i))
|
||||
__i = _Tp(1);
|
||||
} else if (__i == 0 || !std::isfinite(__i)) {
|
||||
if (std::isinf(__i))
|
||||
__i = _Tp(NAN);
|
||||
return std::complex<_Tp>(__x.real(), __i);
|
||||
}
|
||||
} else if (std::isnan(__x.real()) && __x.imag() == 0)
|
||||
return __x;
|
||||
_Tp __e = exp(__x.real());
|
||||
return std::complex<_Tp>(__e * cos(__i), __e * sin(__i));
|
||||
}
|
||||
|
||||
// pow
|
||||
|
||||
// Complex power __x ** __y, computed as exp(__y * log(__x)).
template <class _Tp>
|
||||
std::complex<_Tp> pow(const std::complex<_Tp> &__x,
|
||||
const std::complex<_Tp> &__y) {
|
||||
return exp(__y * log(__x));
|
||||
}
|
||||
|
||||
// __sqr, computes pow(x, 2)
|
||||
|
||||
// Square of __x. The real part is written in the factored form
// (re - im) * (re + im) rather than re*re - im*im; the imaginary part is
// 2 * re * im.
template <class _Tp> std::complex<_Tp> __sqr(const std::complex<_Tp> &__x) {
|
||||
return std::complex<_Tp>((__x.real() - __x.imag()) *
|
||||
(__x.real() + __x.imag()),
|
||||
_Tp(2) * __x.real() * __x.imag());
|
||||
}
|
||||
|
||||
// asinh
|
||||
|
||||
// Complex inverse hyperbolic sine. Infinite and NaN components are resolved
// by the explicit branches below, tested in order (infinite real, NaN real,
// infinite imag); otherwise the result is log(x + sqrt(x^2 + 1)) with the
// sign of each component forced to match the corresponding input component.
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> asinh(const std::complex<_Tp> &__x) {
|
||||
// atan2(+0., -0.) evaluates to pi; used for the pi/4 and pi/2 results below.
const _Tp __pi(atan2(+0., -0.));
|
||||
if (std::isinf(__x.real())) {
|
||||
if (std::isnan(__x.imag()))
|
||||
return __x;
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(__x.real(),
|
||||
copysign(__pi * _Tp(0.25), __x.imag()));
|
||||
return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
|
||||
}
|
||||
if (std::isnan(__x.real())) {
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(__x.imag(), __x.real());
|
||||
if (__x.imag() == 0)
|
||||
return __x;
|
||||
return std::complex<_Tp>(__x.real(), __x.real());
|
||||
}
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(copysign(__x.imag(), __x.real()),
|
||||
copysign(__pi / _Tp(2), __x.imag()));
|
||||
std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
|
||||
return std::complex<_Tp>(copysign(__z.real(), __x.real()),
|
||||
copysign(__z.imag(), __x.imag()));
|
||||
}
|
||||
|
||||
// acosh
|
||||
|
||||
// Complex inverse hyperbolic cosine. Infinite and NaN components are resolved
// by the explicit branches below, tested in order (infinite real, NaN real,
// infinite imag); otherwise the result is log(x + sqrt(x^2 - 1)) with the
// real part forced non-negative and the imaginary part given the sign of the
// input's imaginary part.
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> acosh(const std::complex<_Tp> &__x) {
|
||||
// atan2(+0., -0.) evaluates to pi; used for the pi multiples returned below.
const _Tp __pi(atan2(+0., -0.));
|
||||
if (std::isinf(__x.real())) {
|
||||
if (std::isnan(__x.imag()))
|
||||
return std::complex<_Tp>(abs(__x.real()), __x.imag());
|
||||
if (std::isinf(__x.imag())) {
|
||||
if (__x.real() > 0)
|
||||
return std::complex<_Tp>(__x.real(),
|
||||
copysign(__pi * _Tp(0.25), __x.imag()));
|
||||
else
|
||||
return std::complex<_Tp>(-__x.real(),
|
||||
copysign(__pi * _Tp(0.75), __x.imag()));
|
||||
}
|
||||
if (__x.real() < 0)
|
||||
return std::complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
|
||||
return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
|
||||
}
|
||||
if (std::isnan(__x.real())) {
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(abs(__x.imag()), __x.real());
|
||||
return std::complex<_Tp>(__x.real(), __x.real());
|
||||
}
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(abs(__x.imag()),
|
||||
copysign(__pi / _Tp(2), __x.imag()));
|
||||
std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
|
||||
// copysign(..., _Tp(0)) clears the sign of the real part.
return std::complex<_Tp>(copysign(__z.real(), _Tp(0)),
|
||||
copysign(__z.imag(), __x.imag()));
|
||||
}
|
||||
|
||||
// atanh
|
||||
|
||||
// Complex inverse hyperbolic tangent. Special inputs are resolved by the
// explicit branches below, tested in order (infinite imag, NaN imag, NaN
// real, infinite real, then |real| == 1 with zero imag, which yields a signed
// infinity); otherwise the result is log((1 + x) / (1 - x)) / 2 with the sign
// of each component forced to match the corresponding input component.
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> atanh(const std::complex<_Tp> &__x) {
|
||||
// atan2(+0., -0.) evaluates to pi; used for the pi/2 results below.
const _Tp __pi(atan2(+0., -0.));
|
||||
if (std::isinf(__x.imag())) {
|
||||
return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
|
||||
copysign(__pi / _Tp(2), __x.imag()));
|
||||
}
|
||||
if (std::isnan(__x.imag())) {
|
||||
if (std::isinf(__x.real()) || __x.real() == 0)
|
||||
return std::complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
|
||||
return std::complex<_Tp>(__x.imag(), __x.imag());
|
||||
}
|
||||
if (std::isnan(__x.real())) {
|
||||
return std::complex<_Tp>(__x.real(), __x.real());
|
||||
}
|
||||
if (std::isinf(__x.real())) {
|
||||
return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
|
||||
copysign(__pi / _Tp(2), __x.imag()));
|
||||
}
|
||||
if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
|
||||
return std::complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
|
||||
copysign(_Tp(0), __x.imag()));
|
||||
}
|
||||
std::complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
|
||||
return std::complex<_Tp>(copysign(__z.real(), __x.real()),
|
||||
copysign(__z.imag(), __x.imag()));
|
||||
}
|
||||
|
||||
// sinh
|
||||
|
||||
// Complex hyperbolic sine:
//   (sinh(re) * cos(im), cosh(re) * sin(im))
// with three early returns checked first:
//   - infinite real and non-finite imag -> (real, NaN)
//   - zero real and non-finite imag     -> (real, NaN)
//   - zero imag and non-finite real     -> __x unchanged
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> sinh(const std::complex<_Tp> &__x) {
|
||||
if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
|
||||
return std::complex<_Tp>(__x.real(), _Tp(NAN));
|
||||
if (__x.real() == 0 && !std::isfinite(__x.imag()))
|
||||
return std::complex<_Tp>(__x.real(), _Tp(NAN));
|
||||
if (__x.imag() == 0 && !std::isfinite(__x.real()))
|
||||
return __x;
|
||||
return std::complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
|
||||
cosh(__x.real()) * sin(__x.imag()));
|
||||
}
|
||||
|
||||
// cosh
|
||||
|
||||
// Complex hyperbolic cosine:
//   (cosh(re) * cos(im), sinh(re) * sin(im))
// with four early returns checked first:
//   - infinite real and non-finite imag -> (|real|, NaN)
//   - zero real and non-finite imag     -> (NaN, real)
//   - zero real and zero imag           -> (1, imag)
//   - zero imag and non-finite real     -> (|real|, imag)
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> cosh(const std::complex<_Tp> &__x) {
|
||||
if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
|
||||
return std::complex<_Tp>(abs(__x.real()), _Tp(NAN));
|
||||
if (__x.real() == 0 && !std::isfinite(__x.imag()))
|
||||
return std::complex<_Tp>(_Tp(NAN), __x.real());
|
||||
if (__x.real() == 0 && __x.imag() == 0)
|
||||
return std::complex<_Tp>(_Tp(1), __x.imag());
|
||||
if (__x.imag() == 0 && !std::isfinite(__x.real()))
|
||||
return std::complex<_Tp>(abs(__x.real()), __x.imag());
|
||||
return std::complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
|
||||
sinh(__x.real()) * sin(__x.imag()));
|
||||
}
|
||||
|
||||
// tanh
|
||||
|
||||
// Complex hyperbolic tangent, computed from the doubled components as
//   (sinh(2re) / d, sin(2im) / d)  with  d = cosh(2re) + cos(2im).
// Early returns:
//   - infinite real: (1, 0) if imag is not finite, otherwise (1, zero with
//     the sign of sin(2*imag)); note the real part is 1 for either sign of
//     the infinite input
//   - NaN real with zero imag: __x unchanged
//   - sinh(2re) and d both infinite: (+/-1 by the sign of sinh(2re),
//     +0 if 2*imag > 0 else -0), avoiding the inf/inf division
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> tanh(const std::complex<_Tp> &__x) {
|
||||
if (std::isinf(__x.real())) {
|
||||
if (!std::isfinite(__x.imag()))
|
||||
return std::complex<_Tp>(_Tp(1), _Tp(0));
|
||||
return std::complex<_Tp>(_Tp(1),
|
||||
copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
|
||||
}
|
||||
if (std::isnan(__x.real()) && __x.imag() == 0)
|
||||
return __x;
|
||||
_Tp __2r(_Tp(2) * __x.real());
|
||||
_Tp __2i(_Tp(2) * __x.imag());
|
||||
_Tp __d(cosh(__2r) + cos(__2i));
|
||||
_Tp __2rsh(sinh(__2r));
|
||||
if (std::isinf(__2rsh) && std::isinf(__d))
|
||||
return std::complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1),
|
||||
__2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
|
||||
return std::complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
|
||||
}
|
||||
|
||||
// asin
|
||||
|
||||
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> asin(const std::complex<_Tp> &__x) {
|
||||
std::complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
|
||||
return std::complex<_Tp>(__z.imag(), -__z.real());
|
||||
}
|
||||
|
||||
// acos
|
||||
|
||||
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> acos(const std::complex<_Tp> &__x) {
|
||||
const _Tp __pi(atan2(+0., -0.));
|
||||
if (std::isinf(__x.real())) {
|
||||
if (std::isnan(__x.imag()))
|
||||
return std::complex<_Tp>(__x.imag(), __x.real());
|
||||
if (std::isinf(__x.imag())) {
|
||||
if (__x.real() < _Tp(0))
|
||||
return std::complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
|
||||
return std::complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
|
||||
}
|
||||
if (__x.real() < _Tp(0))
|
||||
return std::complex<_Tp>(__pi,
|
||||
signbit(__x.imag()) ? -__x.real() : __x.real());
|
||||
return std::complex<_Tp>(_Tp(0),
|
||||
signbit(__x.imag()) ? __x.real() : -__x.real());
|
||||
}
|
||||
if (std::isnan(__x.real())) {
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(__x.real(), -__x.imag());
|
||||
return std::complex<_Tp>(__x.real(), __x.real());
|
||||
}
|
||||
if (std::isinf(__x.imag()))
|
||||
return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
|
||||
if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
|
||||
return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
|
||||
std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
|
||||
if (signbit(__x.imag()))
|
||||
return std::complex<_Tp>(abs(__z.imag()), abs(__z.real()));
|
||||
return std::complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
|
||||
}
|
||||
|
||||
// atan
|
||||
|
||||
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> atan(const std::complex<_Tp> &__x) {
|
||||
std::complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
|
||||
return std::complex<_Tp>(__z.imag(), -__z.real());
|
||||
}
|
||||
|
||||
// sin
|
||||
|
||||
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> sin(const std::complex<_Tp> &__x) {
|
||||
std::complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
|
||||
return std::complex<_Tp>(__z.imag(), -__z.real());
|
||||
}
|
||||
|
||||
// cos
|
||||
|
||||
// NOTE(review): unlike the sibling overloads in this header, this one is not
// prefixed with __DEVICE__ -- confirm whether that is intentional.
template <class _Tp>
std::complex<_Tp> cos(const std::complex<_Tp> &__x) {
  // cos(x) = cosh(i * x), where i * (a + ib) = -b + ia.
  const std::complex<_Tp> __ix(-__x.imag(), __x.real());
  return cosh(__ix);
}
|
||||
|
||||
// tan
|
||||
|
||||
template <class _Tp>
|
||||
__DEVICE__ std::complex<_Tp> tan(const std::complex<_Tp> &__x) {
|
||||
std::complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
|
||||
return std::complex<_Tp>(__z.imag(), -__z.real());
|
||||
}
|
||||
|
||||
} // namespace std
|
||||
|
||||
#endif
|
||||
Vendored
+9
-2
@@ -13,6 +13,12 @@
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
|
||||
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
|
||||
#endif
|
||||
|
||||
/// Counts the number of bits in the source operand having a value of 1.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
@@ -23,7 +29,7 @@
|
||||
/// An unsigned 32-bit integer operand.
|
||||
/// \returns A 32-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm_popcnt_u32(unsigned int __A)
|
||||
{
|
||||
return __builtin_popcount(__A);
|
||||
@@ -40,7 +46,7 @@ _mm_popcnt_u32(unsigned int __A)
|
||||
/// An unsigned 64-bit integer operand.
|
||||
/// \returns A 64-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm_popcnt_u64(unsigned long long __A)
|
||||
{
|
||||
return __builtin_popcountll(__A);
|
||||
@@ -48,5 +54,6 @@ _mm_popcnt_u64(unsigned long long __A)
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
|
||||
#endif /* __POPCNTINTRIN_H */
|
||||
|
||||
@@ -78,6 +78,30 @@ extern __inline __m128i
|
||||
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
||||
__v16qi result = (__v16qi)__A;
|
||||
result[__N & 0xf] = __D;
|
||||
return (__m128i)result;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
||||
__v4si result = (__v4si)__A;
|
||||
result[__N & 3] = __D;
|
||||
return (__m128i)result;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
||||
__v2di result = (__v2di)__A;
|
||||
result[__N & 1] = __D;
|
||||
return (__m128i)result;
|
||||
}
|
||||
|
||||
#else
|
||||
#include_next <smmintrin.h>
|
||||
#endif /* defined(__linux__) && defined(__ppc64__) */
|
||||
|
||||
Vendored
+150
@@ -0,0 +1,150 @@
|
||||
/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __X86GPRINTRIN_H
|
||||
#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __UINTRINTRIN_H
|
||||
#define __UINTRINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("uintr")))
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
|
||||
/// user interrupt cannot be delivered on the instruction boundary following
|
||||
/// CLUI. Can be executed only if CR4.UINTR = 1, the logical processor is in
|
||||
/// 64-bit mode, and software is not executing inside an enclave; otherwise,
|
||||
/// it causes an invalid-opcode exception. Causes a transactional abort if
|
||||
/// executed inside a transactional region; the abort loads EAX as it would
|
||||
/// had it been due to an execution of CLI.
|
||||
///
|
||||
/// \headerfile <x86gprintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CLUI </c> instruction.
|
||||
///
|
||||
/// \operation
|
||||
/// UIF := 0
|
||||
/// \endoperation
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_clui (void)
|
||||
{
|
||||
__builtin_ia32_clui();
|
||||
}
|
||||
|
||||
/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
|
||||
/// user interrupt may be delivered on the instruction boundary following
|
||||
/// STUI. Can be executed only if CR4.UINTR = 1, the logical processor is in
|
||||
/// 64-bit mode, and software is not executing inside an enclave; otherwise,
|
||||
/// it causes an invalid-opcode exception. Causes a transactional abort if
|
||||
/// executed inside a transactional region; the abort loads EAX as it would
|
||||
/// had it been due to an execution of STI.
|
||||
///
|
||||
/// \headerfile <x86gprintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> STUI </c> instruction.
|
||||
///
|
||||
/// \operation
|
||||
/// UIF := 1
|
||||
/// \endoperation
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_stui (void)
|
||||
{
|
||||
__builtin_ia32_stui();
|
||||
}
|
||||
|
||||
/// Get the current value of the user interrupt flag (UIF). Can be executed
|
||||
/// regardless of CPL and inside a transactional region. Can be executed only
|
||||
/// if CR4.UINTR = 1, the logical processor is in 64-bit mode, and software is
|
||||
/// not executing inside an enclave; otherwise, it causes an invalid-opcode
|
||||
/// exception.
|
||||
///
|
||||
/// \headerfile <x86gprintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
|
||||
///
|
||||
/// \returns The current value of the user interrupt flag (UIF).
|
||||
///
|
||||
/// \operation
|
||||
/// CF := UIF
|
||||
/// ZF := 0
|
||||
/// AF := 0
|
||||
/// OF := 0
|
||||
/// PF := 0
|
||||
/// SF := 0
|
||||
/// dst := CF
|
||||
/// \endoperation
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_testui (void)
|
||||
{
|
||||
return __builtin_ia32_testui();
|
||||
}
|
||||
|
||||
/// Send interprocessor user interrupt. Can be executed only if
|
||||
/// CR4.UINTR = IA32_UINTR_TT[0] = 1, the logical processor is in 64-bit mode,
|
||||
/// and software is not executing inside an enclave; otherwise, it causes an
|
||||
/// invalid-opcode exception. May be executed at any privilege level, all of
|
||||
/// its memory accesses are performed with supervisor privilege.
|
||||
///
|
||||
/// \headerfile <x86gprintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// Index of user-interrupt target table entry in user-interrupt target
|
||||
/// table.
|
||||
///
|
||||
/// \operation
|
||||
/// IF __a > UITTSZ
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// tempUITTE := MEM[UITTADDR + (__a<<4)]
|
||||
/// // tempUITTE must be valid, and can't have any reserved bit set
|
||||
/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
|
||||
/// GP (0)
|
||||
/// FI
|
||||
/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
|
||||
/// // tempUPID can't have any reserved bit set
|
||||
/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
|
||||
/// GP (0) // release lock
|
||||
/// FI
|
||||
/// tempUPID.PIR[tempUITTE.UV] := 1;
|
||||
/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
|
||||
/// tempUPID.ON := 1
|
||||
/// sendNotify := 1
|
||||
/// ELSE
|
||||
/// sendNotify := 0
|
||||
/// FI
|
||||
/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
|
||||
/// IF sendNotify == 1
|
||||
/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
|
||||
/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
|
||||
/// // ID tempUPID.NDST
|
||||
/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
|
||||
/// ELSE
|
||||
/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
|
||||
/// // ID tempUPID.NDST[15:8]
|
||||
/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
|
||||
/// FI
|
||||
/// FI
|
||||
/// \endoperation
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_senduipi (unsigned long long __a)
|
||||
{
|
||||
__builtin_ia32_senduipi(__a);
|
||||
}
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __UINTRINTRIN_H */
|
||||
Vendored
+73
-39
@@ -18,8 +18,7 @@ typedef int32_t v128_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
||||
// Internal types determined by clang builtin definitions
|
||||
typedef int32_t __v128_u __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef char __i8x16 __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
typedef signed char __s8x16
|
||||
typedef signed char __i8x16
|
||||
__attribute__((__vector_size__(16), __aligned__(16)));
|
||||
typedef unsigned char __u8x16
|
||||
__attribute__((__vector_size__(16), __aligned__(16)));
|
||||
@@ -35,6 +34,13 @@ typedef unsigned long long __u64x2
|
||||
typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
||||
typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
|
||||
typedef unsigned char __u8x8
|
||||
__attribute__((__vector_size__(8), __aligned__(8)));
|
||||
typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8)));
|
||||
typedef unsigned short __u16x4
|
||||
__attribute__((__vector_size__(8), __aligned__(8)));
|
||||
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("simd128"), \
|
||||
__min_vector_width__(128)))
|
||||
@@ -273,7 +279,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) {
|
||||
(__builtin_wasm_extract_lane_s_i8x16((__i8x16)(__a), __i))
|
||||
|
||||
#define wasm_u8x16_extract_lane(__a, __i) \
|
||||
(__builtin_wasm_extract_lane_u_i8x16((__i8x16)(__a), __i))
|
||||
(__builtin_wasm_extract_lane_u_i8x16((__u8x16)(__a), __i))
|
||||
|
||||
#define wasm_i8x16_replace_lane(__a, __i, __b) \
|
||||
((v128_t)__builtin_wasm_replace_lane_i8x16((__i8x16)(__a), __i, __b))
|
||||
@@ -286,7 +292,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) {
|
||||
(__builtin_wasm_extract_lane_s_i16x8((__i16x8)(__a), __i))
|
||||
|
||||
#define wasm_u16x8_extract_lane(__a, __i) \
|
||||
(__builtin_wasm_extract_lane_u_i16x8((__i16x8)(__a), __i))
|
||||
(__builtin_wasm_extract_lane_u_i16x8((__u16x8)(__a), __i))
|
||||
|
||||
#define wasm_i16x8_replace_lane(__a, __i, __b) \
|
||||
((v128_t)__builtin_wasm_replace_lane_i16x8((__i16x8)(__a), __i, __b))
|
||||
@@ -333,17 +339,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_splat(double __a) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_eq(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a == (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a == (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ne(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a != (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a != (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_lt(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a < (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a < (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a,
|
||||
@@ -353,7 +359,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_gt(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a > (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a > (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a,
|
||||
@@ -363,7 +369,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_le(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a <= (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a <= (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a,
|
||||
@@ -373,7 +379,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ge(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)((__s8x16)__a >= (__s8x16)__b);
|
||||
return (v128_t)((__i8x16)__a >= (__i8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_ge(v128_t __a,
|
||||
@@ -595,7 +601,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
|
||||
int32_t __b) {
|
||||
return (v128_t)((__s8x16)__a >> __b);
|
||||
return (v128_t)((__i8x16)__a >> __b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
|
||||
@@ -616,8 +622,8 @@ wasm_i8x16_add_saturate(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u8x16_add_saturate(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__i8x16)__a,
|
||||
(__i8x16)__b);
|
||||
return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__u8x16)__a,
|
||||
(__u8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,
|
||||
@@ -633,8 +639,8 @@ wasm_i8x16_sub_saturate(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u8x16_sub_saturate(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__i8x16)__a,
|
||||
(__i8x16)__b);
|
||||
return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__u8x16)__a,
|
||||
(__u8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,
|
||||
@@ -644,7 +650,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_min_u_i8x16((__i8x16)__a, (__i8x16)__b);
|
||||
return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,
|
||||
@@ -654,12 +660,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_max_u_i8x16((__i8x16)__a, (__i8x16)__b);
|
||||
return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_avgr_u_i8x16((__i8x16)__a, (__i8x16)__b);
|
||||
return (v128_t)__builtin_wasm_avgr_u_i8x16((__u8x16)__a, (__u8x16)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_abs(v128_t __a) {
|
||||
@@ -706,8 +712,8 @@ wasm_i16x8_add_saturate(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u16x8_add_saturate(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__i16x8)__a,
|
||||
(__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__u16x8)__a,
|
||||
(__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,
|
||||
@@ -723,8 +729,8 @@ wasm_i16x8_sub_saturate(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u16x8_sub_saturate(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__i16x8)__a,
|
||||
(__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__u16x8)__a,
|
||||
(__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,
|
||||
@@ -739,7 +745,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_min_u_i16x8((__i16x8)__a, (__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,
|
||||
@@ -749,12 +755,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_max_u_i16x8((__i16x8)__a, (__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, (__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_avgr_u_i16x8((__i16x8)__a, (__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_avgr_u_i16x8((__u16x8)__a, (__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_abs(v128_t __a) {
|
||||
@@ -810,7 +816,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_min_u_i32x4((__i32x4)__a, (__i32x4)__b);
|
||||
return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,
|
||||
@@ -820,7 +826,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a,
|
||||
v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_max_u_i32x4((__i32x4)__a, (__i32x4)__b);
|
||||
return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_neg(v128_t __a) {
|
||||
@@ -1071,8 +1077,8 @@ wasm_i8x16_narrow_i16x8(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u8x16_narrow_i16x8(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__i16x8)__a,
|
||||
(__i16x8)__b);
|
||||
return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__u16x8)__a,
|
||||
(__u16x8)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
@@ -1083,48 +1089,76 @@ wasm_i16x8_narrow_i32x4(v128_t __a, v128_t __b) {
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_u16x8_narrow_i32x4(v128_t __a, v128_t __b) {
|
||||
return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__i32x4)__a,
|
||||
(__i32x4)__b);
|
||||
return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__u32x4)__a,
|
||||
(__u32x4)__b);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i16x8_widen_low_i8x16(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2],
|
||||
((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5],
|
||||
((__i8x16)__a)[6], ((__i8x16)__a)[7]},
|
||||
__i16x8);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i16x8_widen_high_i8x16(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10],
|
||||
((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13],
|
||||
((__i8x16)__a)[14], ((__i8x16)__a)[15]},
|
||||
__i16x8);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i16x8_widen_low_u8x16(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2],
|
||||
((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5],
|
||||
((__u8x16)__a)[6], ((__u8x16)__a)[7]},
|
||||
__u16x8);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i16x8_widen_high_u8x16(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10],
|
||||
((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13],
|
||||
((__u8x16)__a)[14], ((__u8x16)__a)[15]},
|
||||
__u16x8);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i32x4_widen_low_i16x8(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2],
|
||||
((__i16x8)__a)[3]},
|
||||
__i32x4);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i32x4_widen_high_i16x8(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6],
|
||||
((__i16x8)__a)[7]},
|
||||
__i32x4);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i32x4_widen_low_u16x8(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2],
|
||||
((__u16x8)__a)[3]},
|
||||
__u32x4);
|
||||
}
|
||||
|
||||
static __inline__ v128_t __DEFAULT_FN_ATTRS
|
||||
wasm_i32x4_widen_high_u16x8(v128_t __a) {
|
||||
return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a);
|
||||
return (v128_t) __builtin_convertvector(
|
||||
(__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6],
|
||||
((__u16x8)__a)[7]},
|
||||
__u32x4);
|
||||
}
|
||||
|
||||
// Undefine helper macros
|
||||
|
||||
Vendored
+23
@@ -0,0 +1,23 @@
|
||||
/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __X86GPRINTRIN_H
|
||||
#define __X86GPRINTRIN_H
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__HRESET__)
|
||||
#include <hresetintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__UINTR__)
|
||||
#include <uintrintrin.h>
|
||||
#endif
|
||||
|
||||
#endif /* __X86GPRINTRIN_H */
|
||||
Reference in New Issue
Block a user