Unverified commit 495e7f9c authored by wuhuanzhou, committed by GitHub

Update eigen version to f612df27 (#31832)

* update eigen version to f612df27, test=develop

* fix compilation error, test=develop

* remove patch command in eigen, test=develop

* fix compilation error caused by call Eigen function with float16 and bfloat16, test=develop

* fix unittest error, test=develop

* fix unittest error caused by precision, test=develop

* remove patch files used by old version eigen, test=develop
Parent 587d99ae
@@ -14,11 +14,11 @@
include(ExternalProject)
# update eigen to the commit id 4da2c6b1 on 03/19/2020
# update eigen to the commit id f612df27 on 03/16/2021
set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26)
set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee)
cache_third_party(extern_eigen3
REPOSITORY ${EIGEN_REPOSITORY}
@@ -27,48 +27,6 @@ cache_third_party(extern_eigen3
if(WIN32)
add_definitions(-DEIGEN_STRONG_INLINE=inline)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
# For Windows,
# the Tensor header pulls in 'unistd.h' (Tensor:74), which is unavailable on Windows
# and causes a compilation error: "can not open file 'unistd.h'"
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
# For VS2015,
# TensorBlock.h:1028 triggers a compilation error:
# "syntax error"
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
elseif(LINUX)
# For gxx=4.8, __GXX_ABI_VERSION is less than 1004,
# which causes a compilation error in Geometry_SSE.h:38:
# "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)"
# refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
# adding -fabi-version=4 could avoid the above error, but causes "double free corruption" when compiling with gcc8,
# so use the following patch to solve the compilation error with different versions of gcc.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
# Compilers fully support constant expressions only since C++14,
# but Eigen uses some constant expressions, such as std::max and std::min, that are not supported in C++11,
# so add a patch to avoid the compilation error in C++11.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
if(WITH_ROCM)
# For HIPCC, Eigen::internal::device::numeric_limits is not marked EIGEN_DEVICE_FUNC,
# which causes a compiler error about calling a __host__ function from a __host__ __device__ function.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
# For HIPCC, Eigen::internal::scalar_sum_op<bool,bool> is not marked EIGEN_DEVICE_FUNC,
# which causes a compiler error about calling a __host__ function from a __host__ __device__ function.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
else()
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
endif()
endif()
set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
@@ -82,7 +40,7 @@ ExternalProject_Add(
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
PATCH_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
@@ -400,7 +400,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
out.device(d) = x * (temp1 + temp2).template cast<T>();
}
};
@@ -417,7 +417,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
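As a reference for the change above, here is a minimal scalar sketch (illustrative only, not part of this commit) of what HardShrinkFunctor computes; the hypothetical helper mirrors the functor's threshold member:

// Illustrative scalar equivalent of HardShrinkFunctor.
template <typename T>
T hard_shrink_scalar(T x, float threshold) {
  const bool below = x < static_cast<T>(-threshold);
  const bool above = x > static_cast<T>(threshold);
  // (below || above) plays the role of (temp1 + temp2): casting the boolean
  // mask to T and multiplying zeroes out values inside [-threshold, threshold].
  return x * static_cast<T>(below || above);
}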
@@ -24,7 +24,6 @@
namespace Eigen {
using bfloat16 = paddle::platform::bfloat16;
using complex64 = paddle::platform::complex64;
using complex128 = paddle::platform::complex128;
using float16 = paddle::platform::float16;
@@ -33,7 +32,8 @@ template <typename T>
struct NumTraits;
template <>
struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
struct NumTraits<paddle::platform::bfloat16>
: GenericNumTraits<paddle::platform::bfloat16> {
enum {
IsSigned = true,
IsInteger = false,
@@ -41,22 +41,22 @@ struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
RequireInitialization = false
};
HOSTDEVICE static inline bfloat16 epsilon() {
HOSTDEVICE static inline paddle::platform::bfloat16 epsilon() {
return paddle::platform::raw_uint16_to_bfloat16(0x3400);
}
HOSTDEVICE static inline bfloat16 dummy_precision() {
return bfloat16(1e-5f);
HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() {
return paddle::platform::bfloat16(1e-5f);
}
HOSTDEVICE static inline bfloat16 highest() {
HOSTDEVICE static inline paddle::platform::bfloat16 highest() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
}
HOSTDEVICE static inline bfloat16 lowest() {
HOSTDEVICE static inline paddle::platform::bfloat16 lowest() {
return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
}
HOSTDEVICE static inline bfloat16 infinity() {
HOSTDEVICE static inline paddle::platform::bfloat16 infinity() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
}
HOSTDEVICE static inline bfloat16 quiet_NaN() {
HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() {
return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
}
};
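// Note on the raw bit patterns above (illustrative comment, not in the
// original diff): bfloat16 is the upper 16 bits of an IEEE-754 float32
// (1 sign, 8 exponent, 7 mantissa bits), so raw_uint16_to_bfloat16 simply
// reinterprets the pattern. For example, 0x7f80 is +infinity (all exponent
// bits set, zero mantissa) and 0x7f7f is the largest finite value, about
// 3.39e38.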
@@ -137,68 +137,91 @@ namespace numext {
//////////// bfloat methods /////////////
template <>
HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isnan)(a);
}
template <>
HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isinf)(a);
}
template <>
HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isfinite)(a);
}
template <>
HOSTDEVICE inline bfloat16 exp(const bfloat16& a) {
return bfloat16(::expf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 exp(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::expf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 erf(const bfloat16& a) {
return bfloat16(::erff(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 erf(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::erff(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 log(const bfloat16& a) {
return bfloat16(::logf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 log(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::logf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) {
return bfloat16(::tanhf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 tanh(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::tanhf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) {
return bfloat16(::sqrtf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 sqrt(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::sqrtf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) {
return bfloat16(::ceilf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 ceil(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::ceilf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 floor(const bfloat16& a) {
return bfloat16(::floorf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 floor(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::floorf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 round(const bfloat16& a) {
return bfloat16(::roundf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 round(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::roundf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
return bfloat16(::powf(static_cast<float>(a), static_cast<float>(b)));
HOSTDEVICE inline paddle::platform::bfloat16 pow(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return paddle::platform::bfloat16(
::powf(static_cast<float>(a), static_cast<float>(b)));
}
template <>
HOSTDEVICE inline bfloat16 abs(const bfloat16& a) {
return bfloat16(::fabs(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 abs(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::fabs(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline paddle::platform::bfloat16 mini(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return b < a ? b : a;
}
template <>
HOSTDEVICE inline paddle::platform::bfloat16 maxi(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return a < b ? b : a;
}
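// Illustrative note (not in the original diff): these specializations are
// needed because Eigen's scalar_min_op / scalar_max_op (see the
// BinaryFunctors.h copy below) dispatch through numext::mini and
// numext::maxi, so min/max reductions over bfloat16 expressions resolve here.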
//////////// complex64 methods /////////////
@@ -398,5 +421,15 @@ HOSTDEVICE inline float16 abs(const float16& a) {
return float16(::fabs(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 mini(const float16& a, const float16& b) {
return b < a ? b : a;
}
template <>
HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
return a < b ? b : a;
}
} // namespace numext
} // namespace Eigen
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// clang-format off
#ifndef EIGEN_BINARY_FUNCTORS_H
#define EIGEN_BINARY_FUNCTORS_H
namespace Eigen {
namespace internal {
//---------- associative binary functors ----------
template<typename Arg1, typename Arg2>
struct binary_op_base
{
typedef Arg1 first_argument_type;
typedef Arg2 second_argument_type;
};
/** \internal
* \brief Template functor to compute the sum of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
#else
scalar_sum_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::padd(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
// TODO vectorize mixed sum
};
};
/** \internal
* \brief Template specialization to deprecate the summation of boolean expressions.
* This is required to solve Bug 426.
* \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
*/
template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
scalar_sum_op() {}
};
/** \internal
* \brief Template functor to compute the product of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_product_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
#else
scalar_product_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmul(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_mul(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
// TODO vectorize mixed product
};
};
/** \internal
* \brief Template functor to compute the conjugate product of two scalars
*
* This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_conj_product_op : binary_op_base<LhsScalar,RhsScalar>
{
enum {
Conj = NumTraits<LhsScalar>::IsComplex
};
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
{ return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = NumTraits<LhsScalar>::MulCost,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
};
};
/** \internal
* \brief Template functor to compute the min of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmin(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_min(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
};
};
/** \internal
* \brief Template functor to compute the max of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmax(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_max(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
};
};
/** \internal
* \brief Template functors for comparison of two scalars
* \todo Implement packet-comparisons
*/
template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = false
};
};
template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
typedef bool type;
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
};
/** \internal
* \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
// This functor is used by hypotNorm only for which it is faster to first apply abs
// on all coefficients prior to reduction through hypot.
// This way we avoid calling abs on positive and real entries, and this also permits
// to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
// through the same functor...
return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
enum
{
Cost = 3 * NumTraits<Scalar>::AddCost +
2 * NumTraits<Scalar>::MulCost +
2 * scalar_div_cost<Scalar,false>::value,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the pow of two scalars
*/
template<typename Scalar, typename Exponent>
struct scalar_pow_op : binary_op_base<Scalar,Exponent>
{
typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
#else
scalar_pow_op() {
typedef Scalar LhsScalar;
typedef Exponent RhsScalar;
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC
inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
};
template<typename Scalar, typename Exponent>
struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
};
//---------- non associative binary functors ----------
/** \internal
* \brief Template functor to compute the difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator-
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
#else
scalar_difference_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::psub(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
};
};
/** \internal
* \brief Template functor to compute the quotient of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator/()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_quotient_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
#else
scalar_quotient_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pdiv(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
enum {
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
Cost = scalar_div_cost<result_type,PacketAccess>::value
};
};
/** \internal
* \brief Template functor to compute the and of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator&&
*/
struct scalar_boolean_and_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
};
template<> struct functor_traits<scalar_boolean_and_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the or of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator||
*/
struct scalar_boolean_or_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
};
template<> struct functor_traits<scalar_boolean_or_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the xor of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator^
*/
struct scalar_boolean_xor_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
};
template<> struct functor_traits<scalar_boolean_xor_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the absolute difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::absolute_difference
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
#else
scalar_absolute_difference_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
{ return numext::absdiff(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pabsdiff(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
};
};
//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
// They are analogues to std::binder1st/binder2nd but with the following differences:
// - they are compatible with packetOp
// - they are portable across C++ versions (the std::binder* are deprecated in C++11)
template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::first_argument_type first_argument_type;
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
{ return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
first_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::first_argument_type first_argument_type;
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
second_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
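// Usage sketch (illustrative, not part of the original header): binding the
// second argument of scalar_sum_op yields a unary "add a constant" functor
// that still vectorizes through packetOp.
//
//   typedef bind2nd_op<scalar_sum_op<float, float> > add_const_op;
//   add_const_op add5(5.0f);
//   float r = add5(2.0f);             // 7.0f
//   // Packet4f p = add5.packetOp(v); // v + {5,5,5,5} when vectorized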
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_BINARY_FUNCTORS_H
// clang-format on
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GEOMETRY_SSE_H
#define EIGEN_GEOMETRY_SSE_H
namespace Eigen {
namespace internal {
template <class Derived, class OtherDerived>
struct quat_product<Architecture::SSE, Derived, OtherDerived, float> {
enum {
AAlignment = traits<Derived>::Alignment,
BAlignment = traits<OtherDerived>::Alignment,
ResAlignment = traits<Quaternion<float>>::Alignment
};
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a,
const QuaternionBase<OtherDerived>& _b) {
evaluator<typename Derived::Coefficients> ae(_a.coeffs());
evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
Quaternion<float> res;
const __m128 mask = _mm_setr_ps(0.f, 0.f, 0.f, -0.f);
__m128 a = ae.template packet<AAlignment, __m128>(0);
__m128 b = be.template packet<BAlignment, __m128>(0);
__m128 s1 =
pmul(vec4f_swizzle1(a, 1, 2, 0, 2), vec4f_swizzle1(b, 2, 0, 1, 2));
__m128 s2 =
pmul(vec4f_swizzle1(a, 3, 3, 3, 1), vec4f_swizzle1(b, 0, 1, 2, 1));
pstoret<float, __m128, ResAlignment>(
&res.x(),
padd(psub(pmul(a, vec4f_swizzle1(b, 3, 3, 3, 3)),
pmul(vec4f_swizzle1(a, 2, 0, 1, 0),
vec4f_swizzle1(b, 1, 2, 0, 0))),
pxor(mask, padd(s1, s2))));
return res;
}
};
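// For reference (illustrative comment): the swizzled SSE code above evaluates
// the Hamilton product
//   (a * b).vec = a.w * b.vec + b.w * a.vec + a.vec x b.vec
//   (a * b).w   = a.w * b.w   - dot(a.vec, b.vec)
// with the sign flip on the w lane supplied by the (0, 0, 0, -0) xor mask.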
template <class Derived>
struct quat_conj<Architecture::SSE, Derived, float> {
enum { ResAlignment = traits<Quaternion<float>>::Alignment };
static inline Quaternion<float> run(const QuaternionBase<Derived>& q) {
evaluator<typename Derived::Coefficients> qe(q.coeffs());
Quaternion<float> res;
const Packet4f mask = _mm_setr_ps(-0.f, -0.f, -0.f, 0.f);
pstoret<float, Packet4f, ResAlignment>(
&res.x(),
pxor(mask,
qe.template packet<traits<Derived>::Alignment, Packet4f>(0)));
return res;
}
};
template <typename VectorLhs, typename VectorRhs>
struct cross3_impl<Architecture::SSE, VectorLhs, VectorRhs, float, true> {
enum {
ResAlignment =
traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
};
static inline typename plain_matrix_type<VectorLhs>::type run(
const VectorLhs& lhs, const VectorRhs& rhs) {
evaluator<VectorLhs> lhs_eval(lhs);
evaluator<VectorRhs> rhs_eval(rhs);
__m128 a =
lhs_eval.template packet<traits<VectorLhs>::Alignment, __m128>(0);
__m128 b =
rhs_eval.template packet<traits<VectorRhs>::Alignment, __m128>(0);
__m128 mul1 =
pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3));
__m128 mul2 =
pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3));
typename plain_matrix_type<VectorLhs>::type res;
pstoret<float, __m128, ResAlignment>(&res.x(), psub(mul1, mul2));
return res;
}
};
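// Illustrative note: mul1 - mul2 is the component form of the cross product,
// (a.yzx * b.zxy) - (a.zxy * b.yzx), with the fourth lane unused.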
template <class Derived, class OtherDerived>
struct quat_product<Architecture::SSE, Derived, OtherDerived, double> {
enum {
BAlignment = traits<OtherDerived>::Alignment,
ResAlignment = traits<Quaternion<double>>::Alignment
};
static inline Quaternion<double> run(const QuaternionBase<Derived>& _a,
const QuaternionBase<OtherDerived>& _b) {
const Packet2d mask =
_mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
Quaternion<double> res;
evaluator<typename Derived::Coefficients> ae(_a.coeffs());
evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
const double* a = _a.coeffs().data();
Packet2d b_xy = be.template packet<BAlignment, Packet2d>(0);
Packet2d b_zw = be.template packet<BAlignment, Packet2d>(2);
Packet2d a_xx = pset1<Packet2d>(a[0]);
Packet2d a_yy = pset1<Packet2d>(a[1]);
Packet2d a_zz = pset1<Packet2d>(a[2]);
Packet2d a_ww = pset1<Packet2d>(a[3]);
// two temporaries:
Packet2d t1, t2;
/*
* t1 = ww*xy + yy*zw
* t2 = zz*xy - xx*zw
* res.xy = t1 +/- swap(t2)
*/
t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
pstoret<double, Packet2d, ResAlignment>(&res.x(),
_mm_addsub_pd(t1, preverse(t2)));
#else
pstoret<double, Packet2d, ResAlignment>(&res.x(),
padd(t1, pxor(mask, preverse(t2))));
#endif
/*
* t1 = ww*zw - yy*xy
* t2 = zz*zw + xx*xy
* res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
*/
t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
pstoret<double, Packet2d, ResAlignment>(
&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
#else
pstoret<double, Packet2d, ResAlignment>(&res.z(),
psub(t1, pxor(mask, preverse(t2))));
#endif
return res;
}
};
template <class Derived>
struct quat_conj<Architecture::SSE, Derived, double> {
enum { ResAlignment = traits<Quaternion<double>>::Alignment };
static inline Quaternion<double> run(const QuaternionBase<Derived>& q) {
evaluator<typename Derived::Coefficients> qe(q.coeffs());
Quaternion<double> res;
const Packet2d mask0 = _mm_setr_pd(-0., -0.);
const Packet2d mask2 = _mm_setr_pd(-0., 0.);
pstoret<double, Packet2d, ResAlignment>(
&res.x(),
pxor(mask0,
qe.template packet<traits<Derived>::Alignment, Packet2d>(0)));
pstoret<double, Packet2d, ResAlignment>(
&res.z(),
pxor(mask2,
qe.template packet<traits<Derived>::Alignment, Packet2d>(2)));
return res;
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GEOMETRY_SSE_H
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the like), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
#else
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
namespace Eigen {
struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw
raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16) || \
(defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
template <class T>
explicit EIGEN_DEVICE_FUNC half(const T& val)
: half_impl::half_base(
half_impl::float_to_half_rtne(static_cast<float>(val))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
// +0.0 and -0.0 become false, everything else becomes true.
return (x & 0x7fff) != 0;
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
return static_cast<signed char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
return static_cast<unsigned char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
return static_cast<short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
return static_cast<unsigned short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
return static_cast<int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
return static_cast<unsigned int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
return static_cast<long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
return static_cast<unsigned long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
return static_cast<long long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
return static_cast<unsigned long long>(half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
return half_impl::half_to_float(*this);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
return static_cast<double>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
x = other.x;
return *this;
}
};
namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) {
return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) {
return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a) {
return __hneg(a);
}
EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
a = a + b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
a = a * b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
a = a - b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
a = a / b;
return a;
}
EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) {
return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) {
return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) {
return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) {
return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) {
return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) {
return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a,
const half& b) {
return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a,
const half& b) {
return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a,
const half& b) {
return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a,
const half& b) {
return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
half result;
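  // Negation just flips the fp16 sign bit (0x8000); no float round-trip needed.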
result.x = a.x ^ 0x8000;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
a = half(float(a) + float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
a = half(float(a) * float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
a = half(float(a) - float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a,
const half& b) {
return float(a) == float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a,
const half& b) {
return float(a) != float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a,
const half& b) {
return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a,
const half& b) {
return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a,
const half& b) {
return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a,
const half& b) {
return float(a) >= float(b);
}
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
return half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw
raw_uint16_to_half(unsigned short x) {
__half_raw h;
h.x = x;
return h;
}
union FP32 {
unsigned int u;
float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 300
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
__half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
FP32 f;
f.f = ff;
const FP32 f32infty = {255 << 23};
const FP32 f16max = {(127 + 16) << 23};
const FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
unsigned int sign_mask = 0x80000000u;
__half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
} else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
#endif
}
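// Worked example (illustrative): ff = 1.0f has bits 0x3f800000; the sign is 0
// and f.u >= (113 << 23), so the normalized branch runs: the exponent is
// rebiased and the round-to-nearest-even bias added, mant_odd is 0, and
// o.x = f.u >> 13 = 0x3c00, the fp16 encoding of 1.0.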
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
const FP32 magic = {113 << 23};
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
} else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
#endif
}
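// Worked example (illustrative): h.x = 0x3c00 (fp16 1.0) gives
// o.u = (0x3c00 & 0x7fff) << 13 = 0x07800000; the exponent is neither
// all-ones nor zero, so adding (127 - 15) << 23 yields 0x3f800000,
// which is exactly 1.0f.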
// --- standard functions ---
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && \
defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return half(::hlog(a));
#else
return half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
return half(numext::log1p(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
return half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
return half(::sinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
return half(::cosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
return half(::tanf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f2 < f1 ? b : a;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f1 < f2 ? b : a;
#endif
}
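// Note (illustrative): the extra parentheses in half(min)(...) and
// half(max)(...) keep the names from being expanded by the min/max macros
// defined by <windows.h>, the same trick used for (isnan) and friends above.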
EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
os << static_cast<float>(v);
return os;
}
} // end namespace half_impl
// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace internal {
template <>
struct random_default_impl<half, false, false> {
static inline half run(const half& x, const half& y) {
return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
}
static inline half run() { return run(half(-1.f), half(1.f)); }
};
template <>
struct is_arithmetic<half> {
enum { value = true };
};
} // end namespace internal
} // end namespace Eigen
namespace std {
template <>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to
// http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to
// http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half(min)() {
return Eigen::half_impl::raw_uint16_to_half(0x400);
}
static Eigen::half lowest() {
return Eigen::half_impl::raw_uint16_to_half(0xfbff);
}
static Eigen::half(max)() {
return Eigen::half_impl::raw_uint16_to_half(0x7bff);
}
static Eigen::half epsilon() {
return Eigen::half_impl::raw_uint16_to_half(0x0800);
}
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() {
return Eigen::half_impl::raw_uint16_to_half(0x7c00);
}
static Eigen::half quiet_NaN() {
return Eigen::half_impl::raw_uint16_to_half(0x7e00);
}
static Eigen::half signaling_NaN() {
return Eigen::half_impl::raw_uint16_to_half(0x7e00);
}
static Eigen::half denorm_min() {
return Eigen::half_impl::raw_uint16_to_half(0x1);
}
};
}
namespace Eigen {
template <>
struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
return Eigen::half(1e-2f);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
return half_impl::raw_uint16_to_half(0x7bff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
return half_impl::raw_uint16_to_half(0xfbff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
return half_impl::raw_uint16_to_half(0x7c00);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
return half_impl::raw_uint16_to_half(0x7c01);
}
};
} // end namespace Eigen
// C-like standard mathematical functions and transcendentals.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
Eigen::half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
return Eigen::half(::sqrtf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a,
const Eigen::half& b) {
return Eigen::half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
return Eigen::half(::floorf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
return Eigen::half(::ceilf(float(a)));
}
namespace std {
#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(
const Eigen::half& a) const {
return static_cast<std::size_t>(a.x);
}
};
#endif
} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var,
int laneMask,
int width = warpSize) {
#if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(
__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(
__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
}
#endif
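// A hedged usage sketch of the overload above: a butterfly reduction that
// sums one Eigen::half per lane across a full warp, e.g.
//   Eigen::half v = ...;
//   for (int mask = warpSize / 2; mask > 0; mask >>= 1)
//     v = v + __shfl_xor(v, mask);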
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(
const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
}
#endif
#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
return (half_impl::isnan)(h);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
return (half_impl::isinf)(h);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
return (half_impl::isfinite)(h);
}
} // namespace numext
} // namespace Eigen
#endif
#endif // EIGEN_HALF_CUDA_H
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATHFUNCTIONS_H
#define EIGEN_MATHFUNCTIONS_H
// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
// TODO this should better be moved to NumTraits
#define EIGEN_PI \
3.141592653589793238462643383279502884197169399375105820974944592307816406L
namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's define our own
// overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist
// for more recent versions too.
#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <= 1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
long double abs(long double x) { return (fabsl(x)); }
#endif
namespace internal {
/** \internal \class global_math_functions_filtering_base
*
* What it does:
* Defines a typedef 'type' as follows:
* - if type T has a member typedef
* Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
* global_math_functions_filtering_base<T>::type is a typedef for it.
* - otherwise, global_math_functions_filtering_base<T>::type is a typedef for
* T.
*
* How it's used:
* To allow defining the global math functions (like sin...) in certain
* cases, like the Array expressions.
* When you do sin(array1+array2), the object array1+array2 has a complicated
* expression type; all you need to know
* is that it inherits ArrayBase. So we implement a partial specialization of
* sin_impl for ArrayBase<Derived>.
* We must therefore make sure to use sin_impl<ArrayBase<Derived> > and not
* sin_impl<Derived>, otherwise our partial specialization
* won't be used. How does sin know that? That's exactly what
* global_math_functions_filtering_base tells it.
*
* How it's implemented:
* SFINAE in the style of enable_if. Highly susceptible to breaking compilers.
* With GCC it does work, but if you replace
* the typename dummy by an integer template parameter, it doesn't work
* anymore!
*/
template <typename T, typename dummy = void>
struct global_math_functions_filtering_base {
typedef T type;
};
template <typename T>
struct always_void {
typedef void type;
};
template <typename T>
struct global_math_functions_filtering_base<
T,
typename always_void<
typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::
type> {
typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
};
#define EIGEN_MATHFUNC_IMPL(func, scalar) \
Eigen::internal::func##_impl< \
typename Eigen::internal::global_math_functions_filtering_base< \
scalar>::type>
#define EIGEN_MATHFUNC_RETVAL(func, scalar) \
typename Eigen::internal::func##_retval< \
typename Eigen::internal::global_math_functions_filtering_base< \
scalar>::type>::type
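// A sketch of the dispatch described above: for a plain scalar such as
// float, global_math_functions_filtering_base<float>::type is float, so
// EIGEN_MATHFUNC_IMPL(sin, float) selects sin_impl<float>; for an array
// expression type that typedefs
// Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl to
// ArrayBase<Derived>, the same macro selects sin_impl<ArrayBase<Derived> >.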
/****************************************************************************
* Implementation of real *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct real_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) { return x; }
};
template <typename Scalar>
struct real_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
using std::real;
return real(x);
}
};
template <typename Scalar>
struct real_impl : real_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct real_impl<std::complex<T>> {
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x) { return x.real(); }
};
#endif
template <typename Scalar>
struct real_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of imag *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct imag_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar&) { return RealScalar(0); }
};
template <typename Scalar>
struct imag_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
using std::imag;
return imag(x);
}
};
template <typename Scalar>
struct imag_impl : imag_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct imag_impl<std::complex<T>> {
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x) { return x.imag(); }
};
#endif
template <typename Scalar>
struct imag_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of real_ref *
****************************************************************************/
template <typename Scalar>
struct real_ref_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[0];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x) {
return reinterpret_cast<const RealScalar*>(&x)[0];
}
};
template <typename Scalar>
struct real_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
/****************************************************************************
* Implementation of imag_ref *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct imag_ref_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[1];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[1];
}
};
template <typename Scalar>
struct imag_ref_default_impl<Scalar, false> {
EIGEN_DEVICE_FUNC
static inline Scalar run(Scalar&) { return Scalar(0); }
EIGEN_DEVICE_FUNC
static inline const Scalar run(const Scalar&) { return Scalar(0); }
};
template <typename Scalar>
struct imag_ref_impl
: imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
template <typename Scalar>
struct imag_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
/****************************************************************************
* Implementation of conj *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct conj_default_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) { return x; }
};
template <typename Scalar>
struct conj_default_impl<Scalar, true> {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
using std::conj;
return conj(x);
}
};
template <typename Scalar>
struct conj_impl : conj_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct conj_impl<std::complex<T>> {
EIGEN_DEVICE_FUNC
static inline std::complex<T> run(const std::complex<T>& x) {
return std::complex<T>(x.real(), -x.imag());
}
};
#endif
template <typename Scalar>
struct conj_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of abs2 *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct abs2_impl_default {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) { return x * x; }
};
template <typename Scalar>
struct abs2_impl_default<Scalar, true> // IsComplex
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return x.real() * x.real() + x.imag() * x.imag();
}
};
template <typename Scalar>
struct abs2_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return abs2_impl_default<Scalar, NumTraits<Scalar>::IsComplex>::run(x);
}
};
template <typename Scalar>
struct abs2_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of norm1 *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct norm1_default_impl;
template <typename Scalar>
struct norm1_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x.real()) + abs(x.imag());
}
};
template <typename Scalar>
struct norm1_default_impl<Scalar, false> {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
};
template <typename Scalar>
struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
template <typename Scalar>
struct norm1_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of hypot *
****************************************************************************/
template <typename Scalar>
struct hypot_impl;
template <typename Scalar>
struct hypot_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of cast *
****************************************************************************/
template <typename OldType, typename NewType>
struct cast_impl {
EIGEN_DEVICE_FUNC
static inline NewType run(const OldType& x) {
return static_cast<NewType>(x);
}
};
// here, for once, we're plainly returning NewType: we don't want cast to do
// weird things.
template <typename OldType, typename NewType>
EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) {
return cast_impl<OldType, NewType>::run(x);
}
/****************************************************************************
* Implementation of round *
****************************************************************************/
#if EIGEN_HAS_CXX11_MATH
template <typename Scalar>
struct round_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
EIGEN_USING_STD_MATH(round);
return round(x);
}
};
#else
template <typename Scalar>
struct round_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
EIGEN_USING_STD_MATH(floor);
EIGEN_USING_STD_MATH(ceil);
return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
}
};
#endif
template <typename Scalar>
struct round_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of rint *
****************************************************************************/
template <typename Scalar>
struct rint_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
#if EIGEN_HAS_CXX11_MATH
EIGEN_USING_STD_MATH(rint);
#endif
return rint(x);
}
};
#if !EIGEN_HAS_CXX11_MATH
template <>
struct rint_impl<double> {
EIGEN_DEVICE_FUNC
static inline double run(const double& x) { return ::rint(x); }
};
template <>
struct rint_impl<float> {
EIGEN_DEVICE_FUNC
static inline float run(const float& x) { return ::rintf(x); }
};
#endif
template <typename Scalar>
struct rint_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of arg *
****************************************************************************/
#if EIGEN_HAS_CXX11_MATH
template <typename Scalar>
struct arg_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIP does not seem to have a native device side implementation for the
// math routine "arg"
using std::arg;
#else
EIGEN_USING_STD_MATH(arg);
#endif
return arg(x);
}
};
#else
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct arg_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
}
};
template <typename Scalar>
struct arg_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(arg);
return arg(x);
}
};
template <typename Scalar>
struct arg_impl : arg_default_impl<Scalar> {};
#endif
template <typename Scalar>
struct arg_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of expm1 *
****************************************************************************/
// This implementation is based on GSL Math's expm1.
namespace std_fallback {
// fallback expm1 implementation in case there is no expm1(Scalar) function in
// the namespace of Scalar,
// or no suitable std::expm1 function is available. Implementation
// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
template <typename Scalar>
EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x);
if (numext::equal_strict(u, Scalar(1))) {
return x;
}
Scalar um1 = u - RealScalar(1);
if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1);
}
EIGEN_USING_STD_MATH(log);
Scalar logu = log(u);
return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
}
}
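// Why the Kahan trick above matters, as a rough numeric sketch: for tiny x,
// exp(x) rounds to exactly 1, so the naive exp(x) - 1 returns 0 with 100%
// relative error (e.g. x = 1e-8f in float), whereas the fallback returns x,
// correct to within rounding; for larger x the factor x / log(u)
// compensates the cancellation error committed in u - 1.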
template <typename Scalar>
struct expm1_impl {
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
using std::expm1;
#else
using std_fallback::expm1;
#endif
return expm1(x);
}
};
// Specialization for complex types that are not supported by std::expm1.
template <typename RealScalar>
struct expm1_impl<std::complex<RealScalar>> {
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
const std::complex<RealScalar>& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
RealScalar xr = x.real();
RealScalar xi = x.imag();
// expm1(z) = exp(z) - 1
// = exp(x + i * y) - 1
// = exp(x) * (cos(y) + i * sin(y)) - 1
// = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
// Imag(expm1(z)) = exp(x) * sin(y)
// Real(expm1(z)) = exp(x) * cos(y) - 1
//                = expm1(x) + exp(x) * (cos(y) - 1)
//                = expm1(x) - exp(x) * (2 * sin(y / 2) ** 2)
// TODO better use numext::expm1 and numext::sin (but that would require
// forward declarations or moving this specialization down).
RealScalar erm1 = expm1_impl<RealScalar>::run(xr);
RealScalar er = erm1 + RealScalar(1.);
EIGEN_USING_STD_MATH(sin);
RealScalar sin2 = sin(xi / RealScalar(2.));
sin2 = sin2 * sin2;
RealScalar s = sin(xi);
RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
return std::complex<RealScalar>(real_part, er * s);
}
};
template <typename Scalar>
struct expm1_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of log1p *
****************************************************************************/
namespace std_fallback {
// fallback log1p implementation in case there is no log1p(Scalar) function in
// the namespace of Scalar,
// or no suitable std::log1p function is available
template <typename Scalar>
EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
Scalar log_1p = log(x1p);
const bool is_small = numext::equal_strict(x1p, Scalar(1));
const bool is_inf = numext::equal_strict(x1p, log_1p);
return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
}
}
template <typename Scalar>
struct log1p_impl {
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
using std::log1p;
#else
using std_fallback::log1p;
#endif
return log1p(x);
}
};
// Specialization for complex types that are not supported by std::log1p.
template <typename RealScalar>
struct log1p_impl<std::complex<RealScalar>> {
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
const std::complex<RealScalar>& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
return std_fallback::log1p(x);
}
};
template <typename Scalar>
struct log1p_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of pow *
****************************************************************************/
template <typename ScalarX,
typename ScalarY,
bool IsInteger =
NumTraits<ScalarX>::IsInteger && NumTraits<ScalarY>::IsInteger>
struct pow_impl {
// typedef Scalar retval;
typedef typename ScalarBinaryOpTraits<
ScalarX,
ScalarY,
internal::scalar_pow_op<ScalarX, ScalarY>>::ReturnType result_type;
static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x,
const ScalarY& y) {
EIGEN_USING_STD_MATH(pow);
return pow(x, y);
}
};
template <typename ScalarX, typename ScalarY>
struct pow_impl<ScalarX, ScalarY, true> {
typedef ScalarX result_type;
static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) {
ScalarX res(1);
eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
if (y & 1) res *= x;
y >>= 1;
while (y) {
x *= x;
if (y & 1) res *= x;
y >>= 1;
}
return res;
}
};
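// The loop above is binary exponentiation; a worked example: for x = 3 and
// y = 13 = 0b1101, res accumulates x^1, x^4 and x^8, giving
// 3 * 81 * 6561 = 1594323 == 3^13 with only three squarings of x.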
/****************************************************************************
* Implementation of random *
****************************************************************************/
template <typename Scalar, bool IsComplex, bool IsInteger>
struct random_default_impl {};
template <typename Scalar>
struct random_impl : random_default_impl<Scalar,
NumTraits<Scalar>::IsComplex,
NumTraits<Scalar>::IsInteger> {};
template <typename Scalar>
struct random_retval {
typedef Scalar type;
};
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
random(const Scalar& x, const Scalar& y);
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
template <typename Scalar>
struct random_default_impl<Scalar, false, false> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
return x + (y - x) * Scalar(std::rand()) / Scalar(RAND_MAX);
}
static inline Scalar run() {
return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
}
};
enum {
meta_floor_log2_terminate,
meta_floor_log2_move_up,
meta_floor_log2_move_down,
meta_floor_log2_bogus
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2_selector {
enum {
middle = (lower + upper) / 2,
value = (upper <= lower + 1)
? int(meta_floor_log2_terminate)
: (n < (1 << middle)) ? int(meta_floor_log2_move_down)
: (n == 0) ? int(meta_floor_log2_bogus)
: int(meta_floor_log2_move_up)
};
};
template <unsigned int n,
int lower = 0,
int upper = sizeof(unsigned int) * CHAR_BIT - 1,
int selector = meta_floor_log2_selector<n, lower, upper>::value>
struct meta_floor_log2 {};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down> {
enum {
value = meta_floor_log2<
n,
lower,
meta_floor_log2_selector<n, lower, upper>::middle>::value
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up> {
enum {
value = meta_floor_log2<n,
meta_floor_log2_selector<n, lower, upper>::middle,
upper>::value
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate> {
enum {
value = (n >= ((unsigned int)(1) << (lower + 1))) ? lower + 1 : lower
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus> {
// no value, error at compile time
};
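// A compile-time example of the binary search above:
// meta_floor_log2<32768>::value is 15 and meta_floor_log2<32767>::value is
// 14; this is exactly how rand_bits is derived from RAND_MAX + 1 below.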
template <typename Scalar>
struct random_default_impl<Scalar, false, true> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
if (y <= x) return x;
// ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
typedef typename make_unsigned<Scalar>::type ScalarU;
// ScalarX is the widest of ScalarU and unsigned int.
// We'll deal only with ScalarX and unsigned int below thus avoiding signed
// types and arithmetic and signed overflows (which are undefined behavior).
typedef typename conditional<(ScalarU(-1) > unsigned(-1)),
ScalarU,
unsigned>::type ScalarX;
// The following difference doesn't overflow, provided our integer types are
// two's complement and have the same number of padding bits in signed and
// unsigned variants.
// This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max)
divisor = (rand_max + 1) / (range + 1);
else
multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do {
offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range);
return Scalar(ScalarX(x) + offset);
}
static inline Scalar run() {
#ifdef EIGEN_MAKING_DOCS
return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
#else
enum {
rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value,
scalar_bits = sizeof(Scalar) * CHAR_BIT,
shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
offset = NumTraits<Scalar>::IsSigned
? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits, scalar_bits) - 1))
: 0};
return Scalar((std::rand() >> shift) - offset);
#endif
}
};
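// A concrete run of the rejection sampling above, assuming RAND_MAX ==
// 32767: for run(0, 5) the range is 5, so divisor = 32768 / 6 == 5461 and
// offset = rand() / 5461 lies in 0..6; the value 6 (raw draws 32766 and
// 32767) exceeds the range, is rejected and redrawn, keeping 0..5 uniform.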
template <typename Scalar>
struct random_default_impl<Scalar, true, false> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
return Scalar(random(x.real(), y.real()), random(x.imag(), y.imag()));
}
static inline Scalar run() {
typedef typename NumTraits<Scalar>::Real RealScalar;
return Scalar(random<RealScalar>(), random<RealScalar>());
}
};
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
random(const Scalar& x, const Scalar& y) {
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
}
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
}
// Implementation of is* functions
// std::is* do not work with fast-math and gcc; std::is* are available on MSVC
// 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && \
!(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || \
(EIGEN_COMP_MSVC >= 1800) || (EIGEN_COMP_CLANG)
#define EIGEN_USE_STD_FPCLASSIFY 1
#else
#define EIGEN_USE_STD_FPCLASSIFY 0
#endif
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isnan_impl(const T&) {
return false;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isinf_impl(const T&) {
return false;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isfinite_impl(const T&) {
return true;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isfinite_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
return isfinite EIGEN_NOT_A_MACRO(x);
#else
return x <= NumTraits<T>::highest() && x >= NumTraits<T>::lowest();
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isinf_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
return isinf EIGEN_NOT_A_MACRO(x);
#else
return x > NumTraits<T>::highest() || x < NumTraits<T>::lowest();
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isnan_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
return isnan EIGEN_NOT_A_MACRO(x);
#else
return x != x;
#endif
}
#if (!EIGEN_USE_STD_FPCLASSIFY)
#if EIGEN_COMP_MSVC
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) {
return _fpclass(x) == _FPCLASS_NINF || _fpclass(x) == _FPCLASS_PINF;
}
// MSVC defines a _isnan builtin function, but for double only
EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) {
return isinf_msvc_helper(x);
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) {
return isinf_msvc_helper(x);
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) {
return isinf_msvc_helper(x);
}
#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
#if EIGEN_GNUC_AT_LEAST(5, 0)
#define EIGEN_TMP_NOOPT_ATTRIB \
EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
#else
// NOTE the inline qualifier and noinline attribute are both needed: the former
// is to avoid linking issues (duplicate symbol),
// while the latter prevents too aggressive optimizations in fast-math mode:
#define EIGEN_TMP_NOOPT_ATTRIB \
EIGEN_DEVICE_FUNC inline \
__attribute__((noinline, optimize("no-finite-math-only")))
#endif
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) {
return __builtin_isinf(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) {
return __builtin_isinf(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) {
return __builtin_isinf(x);
}
#undef EIGEN_TMP_NOOPT_ATTRIB
#endif
#endif
// The following overloads are defined at the end of this file
template <typename T>
EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
template <typename T>
EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
template <typename T>
T generic_fast_tanh_float(const T& a_x);
} // end namespace internal
/****************************************************************************
* Generic math functions *
****************************************************************************/
namespace numext {
#if (!defined(EIGEN_GPUCC))
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
EIGEN_USING_STD_MATH(min);
return min EIGEN_NOT_A_MACRO(x, y);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO(x, y);
}
#else
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
return y < x ? y : x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float mini(const float& x,
const float& y) {
return fminf(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double mini(const double& x,
const double& y) {
return fmin(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x,
const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fminl" on HIP yet
return (x < y) ? x : y;
#else
return fminl(x, y);
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
return x < y ? y : x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float maxi(const float& x,
const float& y) {
return fmaxf(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double maxi(const double& x,
const double& y) {
return fmax(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x,
const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fmaxl" on HIP yet
return (x > y) ? x : y;
#else
return fmaxl(x, y);
#endif
}
#endif
#if defined(SYCL_DEVICE_ONLY)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE( \
NAME, FUNC, RET_TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
return cl::sycl::FUNC(x); \
}
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC( \
NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, \
const ARG_TYPE2& y) { \
return cl::sycl::FUNC(x, y); \
}
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
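// For instance, SYCL_SPECIALIZE_BINARY_FUNC(mini, min, cl::sycl::cl_int)
// above expands (roughly) to:
//   template <>
//   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int mini(
//       const cl::sycl::cl_int& x, const cl::sycl::cl_int& y) {
//     return cl::sycl::min(x, y);
//   }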
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar)
real(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)>::type
real_ref(const Scalar& x) {
return internal::real_ref_impl<Scalar>::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)
real_ref(Scalar& x) {
return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar)
imag(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar)
arg(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)>::type
imag_ref(const Scalar& x) {
return internal::imag_ref_impl<Scalar>::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)
imag_ref(Scalar& x) {
return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(conj, Scalar)
conj(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar)
abs2(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) {
return x > y ? x - y : y - x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float absdiff(const float& x,
const float& y) {
return fabsf(x - y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double absdiff(const double& x,
const double& y) {
return fabs(x - y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(
const long double& x, const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fabsl" on HIP yet
return (x > y) ? x : y;
#else
return fabsl(x - y);
#endif
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar)
norm1(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar)
hypot(const Scalar& x, const Scalar& y) {
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar)
log1p(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) {
return ::log1pf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) {
return ::log1p(x);
}
#endif
template <typename ScalarX, typename ScalarY>
EIGEN_DEVICE_FUNC inline
typename internal::pow_impl<ScalarX, ScalarY>::result_type
pow(const ScalarX& x, const ScalarY& y) {
return internal::pow_impl<ScalarX, ScalarY>::run(x, y);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
#endif
template <typename T>
EIGEN_DEVICE_FUNC bool(isnan)(const T& x) {
return internal::isnan_impl(x);
}
template <typename T>
EIGEN_DEVICE_FUNC bool(isinf)(const T& x) {
return internal::isinf_impl(x);
}
template <typename T>
EIGEN_DEVICE_FUNC bool(isfinite)(const T& x) {
return internal::isfinite_impl(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(rint, Scalar)
rint(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar)
round(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
#endif
template <typename T>
EIGEN_DEVICE_FUNC T(floor)(const T& x) {
EIGEN_USING_STD_MATH(floor);
return floor(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) {
return ::floorf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) {
return ::floor(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC T(ceil)(const T& x) {
EIGEN_USING_STD_MATH(ceil);
return ceil(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) {
return ::ceilf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) {
return ::ceil(x);
}
#endif
/** Log base 2 for 32-bit positive integers.
* Conveniently returns 0 for x==0. */
inline int log2(int x) {
eigen_assert(x >= 0);
unsigned int v(x);
static const int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16,
18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17,
24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return table[(v * 0x07C4ACDDU) >> 27];
}
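// The function above first smears the highest set bit into every lower bit
// (an input with floor(log2(x)) == k becomes 2^(k+1) - 1), then a De
// Bruijn-style multiply-and-shift indexes the lookup table; e.g. it returns
// 4 for any x in [16, 31].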
/** \returns the square root of \a x.
*
* It is essentially equivalent to
* \code using std::sqrt; return sqrt(x); \endcode
* but slightly faster for float/double and some compilers (e.g., gcc),
* thanks to specializations when SSE is enabled.
*
* Its usage is justified in performance critical functions, like
* norm/normalize.
*/
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sqrt(const T& x) {
EIGEN_USING_STD_MATH(sqrt);
return sqrt(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T& x) {
EIGEN_USING_STD_MATH(log);
return log(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float& x) {
return ::logf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
return ::log(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<NumTraits<T>::IsSigned ||
NumTraits<T>::IsComplex,
typename NumTraits<T>::Real>::type
abs(const T& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned ||
NumTraits<T>::IsComplex),
typename NumTraits<T>::Real>::type
abs(const T& x) {
return x;
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float& x) {
return ::fabsf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double& x) {
return ::fabs(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const std::complex<float>& x) {
return ::hypotf(x.real(), x.imag());
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(
const std::complex<double>& x) {
return ::hypot(x.real(), x.imag());
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T& x) {
EIGEN_USING_STD_MATH(exp);
return exp(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float& x) {
return ::expf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double& x) {
return ::exp(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
const std::complex<float>& x) {
float com = ::expf(x.real());
float res_real = com * ::cosf(x.imag());
float res_imag = com * ::sinf(x.imag());
return std::complex<float>(res_real, res_imag);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
const std::complex<double>& x) {
double com = ::exp(x.real());
double res_real = com * ::cos(x.imag());
double res_imag = com * ::sin(x.imag());
return std::complex<double>(res_real, res_imag);
}
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar)
expm1(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) {
return ::expm1f(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) {
return ::expm1(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T& x) {
EIGEN_USING_STD_MATH(cos);
return cos(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos, cos)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float& x) {
return ::cosf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cos(const double& x) {
return ::cos(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T& x) {
EIGEN_USING_STD_MATH(sin);
return sin(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float& x) {
return ::sinf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sin(const double& x) {
return ::sin(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T& x) {
EIGEN_USING_STD_MATH(tan);
return tan(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float& x) {
return ::tanf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double& x) {
return ::tan(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T& x) {
EIGEN_USING_STD_MATH(acos);
return acos(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T& x) {
EIGEN_USING_STD_MATH(acosh);
return acosh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float& x) {
return ::acosf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double acos(const double& x) {
return ::acos(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T& x) {
EIGEN_USING_STD_MATH(asin);
return asin(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T& x) {
EIGEN_USING_STD_MATH(asinh);
return asinh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float& x) {
return ::asinf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double asin(const double& x) {
return ::asin(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T& x) {
EIGEN_USING_STD_MATH(atan);
return atan(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T& x) {
EIGEN_USING_STD_MATH(atanh);
return atanh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float& x) {
return ::atanf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double atan(const double& x) {
return ::atan(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T& x) {
EIGEN_USING_STD_MATH(cosh);
return cosh(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float& x) {
return ::coshf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cosh(const double& x) {
return ::cosh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T& x) {
EIGEN_USING_STD_MATH(sinh);
return sinh(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float& x) {
return ::sinhf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sinh(const double& x) {
return ::sinh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T& x) {
EIGEN_USING_STD_MATH(tanh);
return tanh(x);
}
#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) {
return internal::generic_fast_tanh_float(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float& x) {
return ::tanhf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tanh(const double& x) {
return ::tanh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) {
EIGEN_USING_STD_MATH(fmod);
return fmod(a, b);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a,
const float& b) {
return ::fmodf(a, b);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a,
const double& b) {
return ::fmod(a, b);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
#undef SYCL_SPECIALIZE_UNARY_FUNC
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
#undef SYCL_SPECIALIZE_BINARY_FUNC
#endif
} // end namespace numext
namespace internal {
template <typename T>
EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x) {
return (numext::isfinite)(numext::real(x)) &&
(numext::isfinite)(numext::imag(x));
}
template <typename T>
EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x) {
return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
}
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x) {
return ((numext::isinf)(numext::real(x)) ||
(numext::isinf)(numext::imag(x))) &&
(!(numext::isnan)(x));
}
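// Note how the three complex predicates interact: for
// std::complex<float>(inf, nan), isnan is true while isinf is false,
// because the trailing !isnan(x) term gives NaN priority over Inf.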
/****************************************************************************
* Implementation of fuzzy comparisons *
****************************************************************************/
template <typename Scalar, bool IsComplex, bool IsInteger>
struct scalar_fuzzy_default_impl {};
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, false> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
return numext::abs(x) <= numext::abs(y) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return numext::abs(x - y) <=
numext::mini(numext::abs(x), numext::abs(y)) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return x <= y || isApprox(x, y, prec);
}
};
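// A quick numeric sketch of isApprox above: with prec = 1e-3, x = 1000.0
// and y = 1000.5 compare approximately equal (|x - y| = 0.5 <= 1000 * 1e-3),
// while x = 1.0 and y = 1.0015 do not (0.0015 > 1.0 * 1e-3); taking the
// smaller magnitude as reference makes the test symmetric and strict.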
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x,
const Scalar&,
const RealScalar&) {
return x == Scalar(0);
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar&) {
return x == y;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x,
const Scalar& y,
const RealScalar&) {
return x <= y;
}
};
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, true, false> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
return numext::abs2(x) <= numext::abs2(y) * prec * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return numext::abs2(x - y) <=
numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
}
};
template <typename Scalar>
struct scalar_fuzzy_impl
: scalar_fuzzy_default_impl<Scalar,
NumTraits<Scalar>::IsComplex,
NumTraits<Scalar>::IsInteger> {};
template <typename Scalar, typename OtherScalar>
EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(
const Scalar& x,
const OtherScalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(
x, y, precision);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline bool isApprox(
const Scalar& x,
const Scalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(
const Scalar& x,
const Scalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
}
/******************************************
*** The special case of the bool type ***
******************************************/
template <>
struct random_impl<bool> {
static inline bool run() { return random<int>(0, 1) == 0 ? false : true; }
};
template <>
struct scalar_fuzzy_impl<bool> {
typedef bool RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x,
const bool&,
const bool&) {
return !x;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(bool x, bool y, bool) { return x == y; }
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const bool& x,
const bool& y,
const bool&) {
return (!x) || y;
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATHFUNCTIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// clang-format off
#ifndef EIGEN_META_H
#define EIGEN_META_H
#if defined(EIGEN_GPU_COMPILE_PHASE)
#include <cfloat>
#if defined(EIGEN_CUDA_ARCH)
#include <math_constants.h>
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
#endif
#endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
#include <cstdint>
#endif
namespace Eigen {
typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
/**
* \brief The Index type as used for the API.
* \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
* \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
*/
typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
namespace internal {
/** \internal
* \file Meta.h
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
* \note In case you wonder, yes we're aware that Boost already provides all these features,
* however, we don't want to add a dependency on Boost.
*/
// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
// and older versions do not provide *intptr_t types.
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
typedef std::intptr_t IntPtr;
typedef std::uintptr_t UIntPtr;
#else
typedef std::ptrdiff_t IntPtr;
typedef std::size_t UIntPtr;
#endif
struct true_type { enum { value = 1 }; };
struct false_type { enum { value = 0 }; };
template<bool Condition>
struct bool_constant;
template<>
struct bool_constant<true> : true_type {};
template<>
struct bool_constant<false> : false_type {};
template<bool Condition, typename Then, typename Else>
struct conditional { typedef Then type; };
template<typename Then, typename Else>
struct conditional <false, Then, Else> { typedef Else type; };
template<typename T> struct remove_reference { typedef T type; };
template<typename T> struct remove_reference<T&> { typedef T type; };
template<typename T> struct remove_pointer { typedef T type; };
template<typename T> struct remove_pointer<T*> { typedef T type; };
template<typename T> struct remove_pointer<T*const> { typedef T type; };
template <class T> struct remove_const { typedef T type; };
template <class T> struct remove_const<const T> { typedef T type; };
template <class T> struct remove_const<const T[]> { typedef T type[]; };
template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
template<typename T> struct remove_all { typedef T type; };
template<typename T> struct remove_all<const T> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T const&> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T&> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T const*> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T*> { typedef typename remove_all<T>::type type; };
template<typename T> struct is_arithmetic { enum { value = false }; };
template<> struct is_arithmetic<float> { enum { value = true }; };
template<> struct is_arithmetic<double> { enum { value = true }; };
template<> struct is_arithmetic<long double> { enum { value = true }; };
template<> struct is_arithmetic<bool> { enum { value = true }; };
template<> struct is_arithmetic<char> { enum { value = true }; };
template<> struct is_arithmetic<signed char> { enum { value = true }; };
template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
template<> struct is_arithmetic<signed short> { enum { value = true }; };
template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
template<> struct is_arithmetic<signed int> { enum { value = true }; };
template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template< class T >
struct is_void : is_same<void, typename remove_const<T>::type> {};
#if EIGEN_HAS_CXX11
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
using std::is_integral;
#else
template<typename T> struct is_integral { enum { value = false }; };
template<> struct is_integral<bool> { enum { value = true }; };
template<> struct is_integral<char> { enum { value = true }; };
template<> struct is_integral<signed char> { enum { value = true }; };
template<> struct is_integral<unsigned char> { enum { value = true }; };
template<> struct is_integral<signed short> { enum { value = true }; };
template<> struct is_integral<unsigned short> { enum { value = true }; };
template<> struct is_integral<signed int> { enum { value = true }; };
template<> struct is_integral<unsigned int> { enum { value = true }; };
template<> struct is_integral<signed long> { enum { value = true }; };
template<> struct is_integral<unsigned long> { enum { value = true }; };
#if EIGEN_COMP_MSVC
template<> struct is_integral<signed __int64> { enum { value = true }; };
template<> struct is_integral<unsigned __int64> { enum { value = true }; };
#endif
#endif
#if EIGEN_HAS_CXX11
using std::make_unsigned;
#else
// TODO: Possibly improve this implementation of make_unsigned.
// It is currently used only by
// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
template<typename> struct make_unsigned;
template<> struct make_unsigned<char> { typedef unsigned char type; };
template<> struct make_unsigned<signed char> { typedef unsigned char type; };
template<> struct make_unsigned<unsigned char> { typedef unsigned char type; };
template<> struct make_unsigned<signed short> { typedef unsigned short type; };
template<> struct make_unsigned<unsigned short> { typedef unsigned short type; };
template<> struct make_unsigned<signed int> { typedef unsigned int type; };
template<> struct make_unsigned<unsigned int> { typedef unsigned int type; };
template<> struct make_unsigned<signed long> { typedef unsigned long type; };
template<> struct make_unsigned<unsigned long> { typedef unsigned long type; };
#if EIGEN_COMP_MSVC
template<> struct make_unsigned<signed __int64> { typedef unsigned __int64 type; };
template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
#endif
#endif
template <typename T> struct add_const { typedef const T type; };
template <typename T> struct add_const<T&> { typedef T& type; };
template <typename T> struct is_const { enum { value = 0 }; };
template <typename T> struct is_const<T const> { enum { value = 1 }; };
template<typename T> struct add_const_on_value_type { typedef const T type; };
template<typename T> struct add_const_on_value_type<T&> { typedef T const& type; };
template<typename T> struct add_const_on_value_type<T*> { typedef T const* type; };
template<typename T> struct add_const_on_value_type<T* const> { typedef T const* const type; };
template<typename T> struct add_const_on_value_type<T const* const> { typedef T const* const type; };
#if EIGEN_HAS_CXX11
using std::is_convertible;
#else
template<typename From, typename To>
struct is_convertible_impl
{
private:
struct any_conversion
{
template <typename T> any_conversion(const volatile T&);
template <typename T> any_conversion(T&);
};
struct yes {int a[1];};
struct no {int a[2];};
template<typename T>
static yes test(T, int);
template<typename T>
static no test(any_conversion, ...);
public:
static typename internal::remove_reference<From>::type* ms_from;
#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning ( disable : 2259 )
#endif
enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif
};
template<typename From, typename To>
struct is_convertible
{
enum { value = is_convertible_impl<From,To>::value };
};
template<typename T>
struct is_convertible<T,T&> { enum { value = false }; };
template<typename T>
struct is_convertible<const T,const T&> { enum { value = true }; };
#endif
/** \internal Allows one to enable/disable an overload
* according to a compile time condition.
*/
template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
#endif
namespace device {
template<typename T> struct numeric_limits
{
EIGEN_DEVICE_FUNC static T epsilon() { return 0; }
EIGEN_DEVICE_FUNC static T (max)() { assert(false && "Highest not supported for this type"); }
EIGEN_DEVICE_FUNC static T (min)() { assert(false && "Lowest not supported for this type"); }
EIGEN_DEVICE_FUNC static T infinity() { assert(false && "Infinity not supported for this type"); }
EIGEN_DEVICE_FUNC static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
};
template<> struct numeric_limits<float>
{
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
static float (max)() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_MAX_NORMAL_F;
#else
return HIPRT_MAX_NORMAL_F;
#endif
}
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
static float infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF_F;
#else
return HIPRT_INF_F;
#endif
}
EIGEN_DEVICE_FUNC
static float quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN_F;
#else
return HIPRT_NAN_F;
#endif
}
};
template<> struct numeric_limits<double>
{
EIGEN_DEVICE_FUNC
static double epsilon() { return __DBL_EPSILON__; }
EIGEN_DEVICE_FUNC
static double (max)() { return DBL_MAX; }
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
static double infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF;
#else
return HIPRT_INF;
#endif
}
EIGEN_DEVICE_FUNC
static double quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN;
#else
return HIPRT_NAN;
#endif
}
};
template<> struct numeric_limits<int>
{
EIGEN_DEVICE_FUNC
static int epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static int (max)() { return INT_MAX; }
EIGEN_DEVICE_FUNC
static int (min)() { return INT_MIN; }
};
template<> struct numeric_limits<unsigned int>
{
EIGEN_DEVICE_FUNC
static unsigned int epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned int (max)() { return UINT_MAX; }
EIGEN_DEVICE_FUNC
static unsigned int (min)() { return 0; }
};
template<> struct numeric_limits<long>
{
EIGEN_DEVICE_FUNC
static long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static long (max)() { return LONG_MAX; }
EIGEN_DEVICE_FUNC
static long (min)() { return LONG_MIN; }
};
template<> struct numeric_limits<unsigned long>
{
EIGEN_DEVICE_FUNC
static unsigned long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned long (max)() { return ULONG_MAX; }
EIGEN_DEVICE_FUNC
static unsigned long (min)() { return 0; }
};
template<> struct numeric_limits<long long>
{
EIGEN_DEVICE_FUNC
static long long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static long long (max)() { return LLONG_MAX; }
EIGEN_DEVICE_FUNC
static long long (min)() { return LLONG_MIN; }
};
template<> struct numeric_limits<unsigned long long>
{
EIGEN_DEVICE_FUNC
static unsigned long long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned long long (max)() { return ULLONG_MAX; }
EIGEN_DEVICE_FUNC
static unsigned long long (min)() { return 0; }
};
}
#endif
/** \internal
 * A base class to disable the default copy ctor and copy assignment operator.
*/
class noncopyable
{
EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
protected:
EIGEN_DEVICE_FUNC noncopyable() {}
EIGEN_DEVICE_FUNC ~noncopyable() {}
};
/** \internal
 * Provides access to the number of elements in the object as a compile-time constant expression.
* It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
*
* Similar to std::tuple_size, but more general.
*
* It currently supports:
* - any types T defining T::SizeAtCompileTime
* - plain C arrays as T[N]
* - std::array (c++11)
* - some internal types such as SingleRange and AllRange
*
* The second template parameter eases SFINAE-based specializations.
*/
template<typename T, typename EnableIf = void> struct array_size {
enum { value = Dynamic };
};
template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
enum { value = T::SizeAtCompileTime };
};
template<typename T, int N> struct array_size<const T (&)[N]> {
enum { value = N };
};
template<typename T, int N> struct array_size<T (&)[N]> {
enum { value = N };
};
#if EIGEN_HAS_CXX11
template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
enum { value = N };
};
template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
enum { value = N };
};
#endif
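// For illustration (hypothetical instantiations):
//   array_size<int(&)[4]>::value == 4 (C array reference)
//   array_size<std::array<float, 3> >::value == 3 (with C++11)
//   array_size<std::vector<int> >::value == Dynamic (size not resolvable at
//   compile time, so the primary template applies)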
/** \internal
* Analogue of the std::size free function.
* It returns the size of the container or view \a x of type \c T
*
* It currently supports:
* - any types T defining a member T::size() const
* - plain C arrays as T[N]
*
*/
template<typename T>
Index size(const T& x) { return x.size(); }
template<typename T,std::size_t N>
Index size(const T (&) [N]) { return N; }
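// For illustration: given `int a[5];`, size(a) returns 5 through the array
// overload, while for any container `c` with a size() member (e.g. an Eigen
// vector of length 3) size(c) simply forwards to c.size().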
/** \internal
* Convenient struct to get the result type of a unary or binary functor.
*
 * It supports both the current STL mechanism (using the result_type member) and
 * the upcoming next STL generation (using a templated result member).
* If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
*/
#if EIGEN_HAS_STD_RESULT_OF
template<typename T> struct result_of {
typedef typename std::result_of<T>::type type1;
typedef typename remove_all<type1>::type type;
};
#else
template<typename T> struct result_of { };
struct has_none {int a[1];};
struct has_std_result_type {int a[2];};
struct has_tr1_result {int a[3];};
template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
template<typename Func, typename ArgType>
struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
template<typename Func, typename ArgType>
struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
template<typename Func, typename ArgType>
struct result_of<Func(ArgType)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename unary_result_of_select<Func, ArgType, FunctorType>::type type;
};
template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
{typedef typename Func::result_type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
{typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct result_of<Func(ArgType0,ArgType1)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
{typedef typename Func::result_type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
};
#endif
struct meta_yes { char a[1]; };
struct meta_no { char a[2]; };
// Check whether T::ReturnType does exist
template <typename T>
struct has_ReturnType
{
template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
template <typename C> static meta_no testFunctor(...);
enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
};
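// For illustration (hypothetical types): given
//   struct WithRT { typedef int ReturnType; };
//   struct WithoutRT {};
// has_ReturnType<WithRT>::value is 1 and has_ReturnType<WithoutRT>::value is 0,
// because the first testFunctor overload is only viable when C::ReturnType
// names a type.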
template<typename T> const T* return_ptr();
template <typename T, typename IndexType=Index>
struct has_nullary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template <typename T, typename IndexType=Index>
struct has_unary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template <typename T, typename IndexType=Index>
struct has_binary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
* Usage example: \code meta_sqrt<1023>::ret \endcode
*/
template<int Y,
int InfX = 0,
int SupX = ((Y==1) ? 1 : Y/2),
bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
// use ?: instead of || just to shut up a stupid gcc 4.3 warning
class meta_sqrt
{
enum {
MidX = (InfX+SupX)/2,
TakeInf = MidX*MidX > Y ? 1 : 0,
NewInf = int(TakeInf) ? InfX : int(MidX),
NewSup = int(TakeInf) ? int(MidX) : SupX
};
public:
enum { ret = meta_sqrt<Y,NewInf,NewSup>::ret };
};
template<int Y, int InfX, int SupX>
class meta_sqrt<Y, InfX, SupX, true> { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
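// For illustration: meta_sqrt<1023>::ret == 31, since 31*31 = 961 <= 1023 and
// 32*32 = 1024 > 1023; the bisection above keeps narrowing [InfX, SupX] until
// the interval width is at most 1.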
/** \internal Computes the least common multiple of two positive integer A and B
* at compile-time. It implements a naive algorithm testing all multiples of A.
* It thus works better if A>=B.
*/
template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
struct meta_least_common_multiple
{
enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
};
template<int A, int B, int K>
struct meta_least_common_multiple<A,B,K,true>
{
enum { ret = A*K };
};
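// For illustration: meta_least_common_multiple<6, 4>::ret == 12; the recursion
// tests the multiples 6*1 = 6 and 6*2 = 12 and stops at the first one that is
// divisible by 4.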
/** \internal determines whether the product of two numeric types is allowed and what the return type is */
template<typename T, typename U> struct scalar_product_traits
{
enum { Defined = 0 };
};
// FIXME quick workaround around current limitation of result_of
// template<typename Scalar, typename ArgType0, typename ArgType1>
// struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
// typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
// };
/** \internal Obtains a POD type suitable to use as storage for an object of a size
* of at most Len bytes, aligned as specified by \c Align.
*/
template<unsigned Len, unsigned Align>
struct aligned_storage {
struct type {
EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
};
};
} // end namespace internal
namespace numext {
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
#if defined(EIGEN_GPU_COMPILE_PHASE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
#endif
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template<typename T>
EIGEN_DEVICE_FUNC
T div_ceil(const T &a, const T &b)
{
return (a+b-1) / b;
}
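// For illustration: with T = int, div_ceil(10, 3) == (10 + 3 - 1) / 3 == 4 and
// div_ceil(9, 3) == 3; the (a + b - 1) trick rounds up in pure integer
// arithmetic, which is why a >= 0 and b > 0 are required.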
// The aim of the following functions is to bypass -Wfloat-equal warnings
// when we really want a strict equality comparison on floating points.
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const X& x,const Y& y) { return x == y; }
#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
#endif
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
#endif
/** \internal extract the bits of the float \a x */
inline unsigned int as_uint(float x)
{
unsigned int ret;
std::memcpy(&ret, &x, sizeof(float));
return ret;
}
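// For illustration (assuming IEEE-754 binary32 floats): as_uint(1.0f) yields
// 0x3f800000u and as_uint(-0.0f) yields 0x80000000u; std::memcpy is used
// instead of a pointer cast to avoid strict-aliasing violations.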
} // end namespace numext
} // end namespace Eigen
// Define portable (u)int{32,64} types
#if EIGEN_HAS_CXX11
#include <cstdint>
namespace Eigen {
namespace numext {
typedef std::uint8_t uint8_t;
typedef std::int8_t int8_t;
typedef std::uint16_t uint16_t;
typedef std::int16_t int16_t;
typedef std::uint32_t uint32_t;
typedef std::int32_t int32_t;
typedef std::uint64_t uint64_t;
typedef std::int64_t int64_t;
}
}
#else
// Without c++11, all compilers able to compile Eigen also
// provide the C99 stdint.h header file.
#include <stdint.h>
namespace Eigen {
namespace numext {
typedef ::uint8_t uint8_t;
typedef ::int8_t int8_t;
typedef ::uint16_t uint16_t;
typedef ::int16_t int16_t;
typedef ::uint32_t uint32_t;
typedef ::int32_t int32_t;
typedef ::uint64_t uint64_t;
typedef ::int64_t int64_t;
}
}
#endif
#endif // EIGEN_META_H
// clang-format on
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//#ifndef EIGEN_CXX11_TENSOR_MODULE
//#define EIGEN_CXX11_TENSOR_MODULE
#include "../../../Eigen/Core"
#if EIGEN_HAS_CXX11
#include "../SpecialFunctions"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
* This module provides a Tensor class for storing arbitrarily indexed
* objects.
*
* \code
* #include <Eigen/CXX11/Tensor>
* \endcode
*
* Much of the documentation can be found \ref eigen_tensors "here".
*/
#include <cmath>
#include <cstddef>
#include <cstring>
#include <random>
#ifdef _WIN32
typedef __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#include <windows.h>
#else
#include <stdint.h>
#include <unistd.h>
#endif
#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#include <mach/mach_time.h>
#else
#include <time.h>
#endif
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#endif
#ifdef EIGEN_USE_GPU
#include <iostream>
#if defined(EIGEN_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <atomic>
#endif
#include "src/Tensor/TensorMacros.h"
#include "src/Tensor/TensorForwardDeclarations.h"
#include "src/Tensor/TensorMeta.h"
#include "src/Tensor/TensorFunctors.h"
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceGpu.h"
#ifndef gpu_assert
#define gpu_assert(x)
#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"
#include "src/Tensor/TensorVolumePatch.h"
#include "src/Tensor/TensorBroadcasting.h"
#include "src/Tensor/TensorChipping.h"
#include "src/Tensor/TensorInflation.h"
#include "src/Tensor/TensorLayoutSwap.h"
#include "src/Tensor/TensorMorphing.h"
#include "src/Tensor/TensorPadding.h"
#include "src/Tensor/TensorReverse.h"
#include "src/Tensor/TensorShuffling.h"
#include "src/Tensor/TensorStriding.h"
#include "src/Tensor/TensorCustomOp.h"
#include "src/Tensor/TensorEvalTo.h"
#include "src/Tensor/TensorForcedEval.h"
#include "src/Tensor/TensorGenerator.h"
#include "src/Tensor/TensorAssign.h"
#include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorTrace.h"
#ifdef EIGEN_USE_SYCL
#include "src/Tensor/TensorReductionSycl.h"
#include "src/Tensor/TensorConvolutionSycl.h"
#include "src/Tensor/TensorContractionSycl.h"
#include "src/Tensor/TensorScanSycl.h"
#endif
#include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h"
#include "src/Tensor/TensorStorage.h"
#include "src/Tensor/Tensor.h"
#include "src/Tensor/TensorFixedSize.h"
#include "src/Tensor/TensorMap.h"
#include "src/Tensor/TensorRef.h"
#include "src/Tensor/TensorIO.h"
#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_HAS_CXX11
//#endif // EIGEN_CXX11_TENSOR_MODULE
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
namespace Eigen {
namespace internal {
// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.
// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
const DSizes<IndexType, NumDims>& dimensions) {
DSizes<IndexType, NumDims> strides;
if (NumDims == 0) return strides;
// TODO(ezhulenev): Use templates to unroll this loop (similar to
// h_array_reduce in CXX11meta.h)? Benchmark it.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i - 1] * dimensions[i - 1];
}
} else {
strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dimensions[i + 1];
}
}
return strides;
}
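// For illustration: for dimensions [2, 3, 4] the ColMajor strides are
// [1, 2, 6] (innermost dimension first) and the RowMajor strides are
// [12, 4, 1], so a coefficient at multi-index (i, j, k) lives at linear
// offset i*s[0] + j*s[1] + k*s[2].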
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
const Eigen::array<IndexType, NumDims>& dimensions) {
return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
const Sizes<Indices...>& sizes) {
return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
// -------------------------------------------------------------------------- //
// Tensor block shape type defines the shape preference for the blocks
// extracted from the larger tensor.
//
// Example: blocks of 100 elements from the large 100x100 tensor:
// - tensor: 100x100
// - target_block_size: 100
//
// TensorBlockShapeType:
// - kUniformAllDims: 100 blocks of size 10x10
// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
// or row major layout)
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
struct TensorBlockResourceRequirements {
TensorBlockShapeType shape_type; // target block shape
size_t size; // target block size
TensorOpCost cost_per_coeff; // cost of computing a single block element
#ifdef EIGEN_HIPCC
// For HIPCC, we need to explicitly declare the constructor which is
// implicitly invoked in the "merge" / "any" routines as a "device function";
// otherwise HIPCC errors out complaining about the lack of a matching
// constructor.
EIGEN_DEVICE_FUNC
TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
TensorOpCost cost_)
: shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
{}
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
TensorBlockShapeType shape_type, size_t size_in_bytes,
TensorOpCost cost) {
const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
return {shape_type, size, cost};
}
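// For illustration: withShapeAndSize<float>(shape_type, /*size_in_bytes=*/1024,
// cost) targets 1024 / sizeof(float) == 256 coefficients per block; the
// numext::maxi guard keeps the target at one coefficient even when
// size_in_bytes < sizeof(Scalar).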
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
TensorBlockShapeType shape_type, size_t size_in_bytes) {
// This default cost per coefficient is valid for most materialized tensor
// block evaluation implementations, because they typically just read
// coefficients from the underlying tensor storage, and write to the tensor
// block buffer (scratch or destination memory, reads and writes have linear
// access pattern). We ignore the fixed cost of block evaluation, because in
// practice it should negligible.
//
// Lazy block evaluation adds the cost of calling a functor for each
// coefficient.
//
// All non-trivial block evaluation implementations must provide their own
// cost approximation (e.g. shuffling inner dimension has a much higher cost
// because it reads memory randomly, although the total number of moved
// bytes is the same).
return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
{/*bytes_loaded=*/sizeof(Scalar),
/*bytes_stored=*/sizeof(Scalar),
/*compute_cycles=*/0});
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
size_in_bytes);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
size_in_bytes);
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
merge(const TensorBlockResourceRequirements& lhs,
const TensorBlockResourceRequirements& rhs) {
return {merge(lhs.shape_type, rhs.shape_type), // shape_type
merge(lhs.size, rhs.size), // size
merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
}
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
TensorOpCost cost) {
cost_per_coeff += cost;
return *this;
}
// This is a resource requirement that should be returned from expressions
// that do not have any block evaluation preference (e.g. default tensor
// expression with raw buffer access).
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
}
private:
using Requirements = TensorBlockResourceRequirements;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
return numext::maxi(lhs_size, rhs_size);
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockShapeType
merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
rhs == TensorBlockShapeType::kSkewedInnerDims)
? TensorBlockShapeType::kSkewedInnerDims
: TensorBlockShapeType::kUniformAllDims;
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
TensorOpCost rhs_cost) {
return lhs_cost + rhs_cost;
}
};
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
  // If we evaluate a Tensor assignment and the expression on the left already
  // has a memory buffer, then we might apply a performance optimization and
  // evaluate the root expression directly into the final output memory.
  // Sometimes it is possible to reuse it for materializing subexpressions
  // inside an expression tree, to avoid dynamic memory allocation.
//
  // The pointer type of the underlying storage is erased, because passing the
  // Scalar type through all the expression evaluation layers would require far
  // too many templates. In practice the destination buffer type should always
  // match the evaluated expression scalar type.
class DestinationBuffer {
public:
enum DestinationBufferKind : int {
// The above explicit specification of "int" as the enum basetype is
// needed to get around a HIPCC link error ("the field type is not
// amp-compatible")
// which is issued for class members with the enum type.
// TODO(rocm):
// remove the "int" basetype once HIPCC has been fixed to not error out
// in the above scenario.
// Destination buffer is not defined (`m_data` == nullptr).
kEmpty,
// Tensor block defined by an owning tensor block descriptor can fit
// contiguously into the destination buffer. In this case it's safe to
// materialize tensor block in the destination buffer, wrap it in a
// TensorMap, and use to build Eigen expression on top of it.
kContiguous,
// Destination buffer strides do not match strides of the contiguously
// stored block, and it's impossible to define a TensorMap over this
// buffer. However if we are evaluating a root of an expression tree, we
// still can materialize an output into this destination, because we can
// guarantee that no one will ever access it through block API.
//
// In theory it is possible to build valid TensorStriding<TensorMap>
// expression on top of this destination buffer, however it has
// inefficient coeff/packet access, and defeats the purpose of fast block
// evaluation API.
kStrided
};
template <typename Scalar>
Scalar* data() const {
eigen_assert(m_data_type_size == sizeof(Scalar));
return static_cast<Scalar*>(m_data);
}
const Dimensions& strides() const { return m_strides; }
const DestinationBufferKind& kind() const { return m_kind; }
private:
friend class TensorBlockDescriptor;
DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
template <typename Scalar>
DestinationBuffer(Scalar* data, const Dimensions& strides,
DestinationBufferKind kind)
: m_data(static_cast<void*>(data)),
m_data_type_size(sizeof(Scalar)),
m_strides(strides),
m_kind(kind) {}
template <int Layout, typename Scalar>
static DestinationBuffer make(const TensorBlockDescriptor& desc,
Scalar* data, const Dimensions& strides) {
return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
}
template <int Layout>
static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
const Dimensions& strides) {
const Dimensions& desc_dims = desc.dimensions();
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
for (int i = 0; i < NumDims; ++i) {
if (desc_dims[i] == 1) continue;
if (desc_strides[i] != strides[i]) return kStrided;
}
return kContiguous;
}
// Storage pointer is type erased, to reduce template bloat, but we still
// keep the size of the underlying element type for error checking.
void* m_data;
size_t m_data_type_size;
    // Destination buffer dimensions always match the dimensions of the tensor
    // block descriptor it belongs to; however, the strides might be different.
Dimensions m_strides;
DestinationBufferKind m_kind;
};
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
const DestinationBuffer& destination)
: m_offset(offset),
m_dimensions(dimensions),
m_destination(destination) {}
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
: m_offset(offset),
m_dimensions(dimensions),
m_destination(DestinationBuffer()) {}
IndexType offset() const { return m_offset; }
const Dimensions& dimensions() const { return m_dimensions; }
IndexType dimension(int index) const { return m_dimensions[index]; }
IndexType size() const { return array_prod<IndexType>(m_dimensions); }
const DestinationBuffer& destination() const { return m_destination; }
template <int Layout, typename Scalar>
void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
eigen_assert(dst_base != NULL);
m_destination =
DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
}
template <int Layout, typename Scalar, typename DstStridesIndexType>
void AddDestinationBuffer(
Scalar* dst_base,
const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
// DSizes constructor will do index type promotion if it's safe.
AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
}
TensorBlockDescriptor& DropDestinationBuffer() {
m_destination.m_data = NULL;
m_destination.m_kind = DestinationBuffer::kEmpty;
return *this;
}
bool HasDestinationBuffer() const {
return m_destination.kind() != DestinationBuffer::kEmpty;
}
// Returns a copy of `*this` with updated offset.
TensorBlockDescriptor WithOffset(IndexType offset) const {
return TensorBlockDescriptor(offset, m_dimensions, m_destination);
}
private:
// Offset and dimensions are immutable after construction. Block descriptor
// can only be mutated by adding or dropping destination.
const IndexType m_offset;
const Dimensions m_dimensions;
DestinationBuffer m_destination;
};
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
TensorBlockMapper() = default;
TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
const TensorBlockResourceRequirements& requirements)
: m_tensor_dimensions(dimensions), m_requirements(requirements) {
// Compute block dimensions and the total number of blocks.
InitializeBlockDimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
return m_total_block_count;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
return m_block_dimensions.TotalSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
blockDimensions() const {
return m_block_dimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
blockDescriptor(IndexType block_index) const {
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
IndexType offset = 0;
DSizes<IndexType, NumDims> dimensions;
if (NumDims == 0) return BlockDescriptor(offset, dimensions);
// Iterate outer -> inner dimensions.
for (int i = NumDims - 1; i >= 0; --i) {
const int dim = isColMajor ? i : NumDims - i - 1;
const IndexType idx = block_index / m_block_strides[dim];
block_index -= idx * m_block_strides[dim];
const IndexType coord = idx * m_block_dimensions[dim];
dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
m_block_dimensions[dim]);
offset += coord * m_tensor_strides[dim];
}
return {offset, dimensions};
}
private:
void InitializeBlockDimensions() {
// Requested block shape and size.
const TensorBlockShapeType shape_type = m_requirements.shape_type;
IndexType target_block_size =
numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
IndexType tensor_size = m_tensor_dimensions.TotalSize();
// Corner case: one of the dimensions is zero. Logic below is too complex
// to handle this case on a general basis, just use unit block size.
// Note: we must not yield blocks with zero dimensions (recipe for
// overflows/underflows, divisions by zero and NaNs later).
if (tensor_size == 0) {
for (int i = 0; i < NumDims; ++i) {
m_block_dimensions[i] = 1;
}
m_total_block_count = 0;
return;
}
// If tensor fits into a target block size, evaluate it as a single block.
if (tensor_size <= target_block_size) {
m_block_dimensions = m_tensor_dimensions;
m_total_block_count = 1;
// The only valid block index is `0`, and in this case we do not need
// to compute real strides for tensor or blocks (see blockDescriptor).
for (int i = 0; i < NumDims; ++i) {
m_tensor_strides[i] = 0;
m_block_strides[i] = 1;
}
return;
}
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
// Block shape skewed towards inner dimension.
if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
IndexType coeff_to_allocate = target_block_size;
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
m_block_dimensions[dim] =
numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
coeff_to_allocate = divup(
coeff_to_allocate,
numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
// Tensor will not fit within 'target_block_size' budget: calculate tensor
// block dimension sizes based on "square" dimension size target.
const IndexType dim_size_target = convert_index<IndexType>(
std::pow(static_cast<float>(target_block_size),
1.0f / static_cast<float>(m_block_dimensions.rank())));
for (int i = 0; i < NumDims; ++i) {
// TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing
// 'block_dim_size' in this manner can increase the number of
// blocks, and so will amplify any per-block overhead.
m_block_dimensions[i] =
numext::mini(dim_size_target, m_tensor_dimensions[i]);
}
// Add any un-allocated coefficients to inner dimension(s).
IndexType total_size = m_block_dimensions.TotalSize();
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
const IndexType total_size_other_dims =
total_size / m_block_dimensions[dim];
const IndexType alloc_avail =
divup<IndexType>(target_block_size, total_size_other_dims);
if (alloc_avail == m_block_dimensions[dim]) {
// Insufficient excess coefficients to allocate.
break;
}
m_block_dimensions[dim] =
numext::mini(m_tensor_dimensions[dim], alloc_avail);
total_size = total_size_other_dims * m_block_dimensions[dim];
}
}
} else {
eigen_assert(false); // unknown block shape
}
eigen_assert(m_block_dimensions.TotalSize() >=
numext::mini<IndexType>(target_block_size,
m_tensor_dimensions.TotalSize()));
// Calculate block counts by dimension and total block count.
DSizes<IndexType, NumDims> block_count;
for (int i = 0; i < NumDims; ++i) {
block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
}
m_total_block_count = array_prod(block_count);
// Calculate block strides (used for enumerating blocks).
m_tensor_strides = strides<Layout>(m_tensor_dimensions);
m_block_strides = strides<Layout>(block_count);
}
DSizes<IndexType, NumDims> m_tensor_dimensions;
TensorBlockResourceRequirements m_requirements;
DSizes<IndexType, NumDims> m_block_dimensions;
IndexType m_total_block_count;
DSizes<IndexType, NumDims> m_tensor_strides;
DSizes<IndexType, NumDims> m_block_strides;
};
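// For illustration: a hypothetical ColMajor 10x10 tensor mapped with a target
// block size of 25 coefficients and kUniformAllDims yields 5x5 block
// dimensions and blockCount() == 4; blockDescriptor(3) then describes the
// bottom-right quadrant, i.e. offset 55 (element (5, 5)) with dimensions
// [5, 5].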
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size.
// Scratch allocator keeps a trace of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.
template <typename Device>
class TensorBlockScratchAllocator {
public:
explicit TensorBlockScratchAllocator(const Device& device)
: m_device(device), m_allocation_index(0) {}
~TensorBlockScratchAllocator() {
for (size_t i = 0; i < m_allocations.size(); ++i) {
m_device.deallocate(m_allocations[i].ptr);
}
}
void* allocate(size_t size) {
// TODO(ezhulenev): Remove when replaced with inlined vector.
if (m_allocations.capacity() == 0) m_allocations.reserve(8);
    // Check if we already have an existing allocation at the current index.
const int num_allocations = static_cast<int>(m_allocations.size());
const bool has_allocation = m_allocation_index < num_allocations;
// Allocation index can't be larger than the number of allocations.
eigen_assert(m_allocation_index <= num_allocations);
    // If we have an existing allocation and its size is larger than or equal
    // to the requested size, we do nothing.
    // If the current allocation can't fit the requested size, we deallocate it
    // and replace it with a larger allocation.
if (has_allocation && m_allocations[m_allocation_index].size < size) {
m_device.deallocate(m_allocations[m_allocation_index].ptr);
m_allocations[m_allocation_index].ptr = m_device.allocate(size);
m_allocations[m_allocation_index].size = size;
}
    // Make a new allocation if we don't have an existing one.
if (!has_allocation) {
Allocation allocation;
allocation.ptr = m_device.allocate(size);
allocation.size = size;
m_allocations.push_back(allocation);
}
eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
eigen_assert(m_allocations[m_allocation_index].size >= size);
return m_allocations[m_allocation_index++].ptr;
}
void reset() { m_allocation_index = 0; }
private:
struct Allocation {
void* ptr;
size_t size;
};
const Device& m_device;
int m_allocation_index;
// TODO(ezhulenev): This should be an inlined vector.
std::vector<Allocation> m_allocations;
};
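// A hedged usage sketch (assuming a Device type `d` providing
// allocate/deallocate):
//   TensorBlockScratchAllocator<Device> scratch(d);
//   void* a = scratch.allocate(512);  // first block: fresh allocation
//   void* b = scratch.allocate(256);  // second temporary for the same block
//   scratch.reset();                  // next block evaluation starts over
//   void* c = scratch.allocate(512);  // reuses the buffer behind `a`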
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
enum TensorBlockKind {
// Tensor block that is a lazy expression that must be assigned to a
// destination using TensorBlockAssign.
kExpr,
// Tensor block that is a view into a memory buffer owned by an underlying
// Tensor expression (e.g. it can be a view into a Tensor buffer).
kView,
// Tensor block that was materialized in a scratch memory buffer, allocated
// with TensorBlockScratchAllocator. This block must be copied to a
// destination, similar to a block of `kExpr` type.
kMaterializedInScratch,
// Tensor block that was materialized directly into the final output memory
// buffer. For example if the left side of an assignment is a Tensor, we can
// directly materialize the block in the destination memory.
//
// If strides in the output buffer do not match tensor block strides, the
// Tensor expression will be invalid, and should not be used by
// TensorBlockAssign or for constructing another block expression.
kMaterializedInOutput
};
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef in
// TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
public:
typedef void XprType;
};
// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block expression
// for argument types that do not support block evaluation.
template <typename XprType>
struct XprScalar {
typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer; it's either a memory
// buffer that backs the original expression (e.g. block is just a view into a
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.
template <typename Scalar, int NumDims, int Layout,
typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
const Dimensions& dimensions, bool valid_expr = true)
: m_kind(kind),
m_data(data),
m_dimensions(dimensions),
m_expr(m_data, m_dimensions),
m_valid_expr(valid_expr) {
eigen_assert(m_kind == internal::TensorBlockKind::kView ||
m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
m_kind == internal::TensorBlockKind::kMaterializedInOutput);
}
TensorBlockKind kind() const { return m_kind; }
// NOTE(ezhulenev): Returning XprType by value like in other block types
// causes asan failures. The theory is that XprType::Nested doesn't work
// properly for TensorMap.
const XprType& expr() const {
eigen_assert(m_valid_expr);
return m_expr;
}
const Scalar* data() const { return m_data; }
void cleanup() {}
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
// TensorMaterializedBlock can be backed by different types of storage:
//
// (1) Contiguous block of memory allocated with scratch allocator.
// (2) Contiguous block of memory reused from tensor block descriptor
// destination buffer.
// (3) Strided block of memory reused from tensor block descriptor
// destination buffer.
//
class Storage {
public:
Scalar* data() const { return m_data; }
const Dimensions& dimensions() const { return m_dimensions; }
const Dimensions& strides() const { return m_strides; }
TensorMaterializedBlock AsTensorMaterializedBlock() const {
return TensorMaterializedBlock(
m_materialized_in_output
? internal::TensorBlockKind::kMaterializedInOutput
: internal::TensorBlockKind::kMaterializedInScratch,
m_data, m_dimensions, !m_strided_storage);
}
private:
friend class TensorMaterializedBlock;
Storage(Scalar* data, const Dimensions& dimensions,
const Dimensions& strides, bool materialized_in_output,
bool strided_storage)
: m_data(data),
m_dimensions(dimensions),
m_strides(strides),
m_materialized_in_output(materialized_in_output),
m_strided_storage(strided_storage) {}
Scalar* m_data;
Dimensions m_dimensions;
Dimensions m_strides;
bool m_materialized_in_output;
bool m_strided_storage;
};
  // Creates storage for a materialized block, either from the block descriptor
  // destination buffer or by allocating a new buffer with the scratch allocator.
template <typename TensorBlockScratch>
EIGEN_STRONG_INLINE static Storage prepareStorage(
TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool allow_strided_storage = false) {
// Try to reuse destination as an output block buffer.
typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
if (desc.destination().kind() == DestinationBuffer::kContiguous) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/true,
/*strided_storage=*/false);
} else if (desc.destination().kind() == DestinationBuffer::kStrided &&
allow_strided_storage) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(), desc.destination().strides(),
/*materialized_in_output=*/true, /*strided_storage=*/true);
} else {
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/false,
/*strided_storage=*/false);
}
}
// Creates a materialized block for the given descriptor from a memory buffer.
template <typename DataDimensions, typename TensorBlockScratch>
EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
const Scalar* data, const DataDimensions& data_dims,
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation and construct a
    // block from the existing `data` memory buffer.
//
// Example: (RowMajor layout)
// data_dims: [11, 12, 13, 14]
// desc.dimensions(): [1, 1, 3, 14]
//
// In this case we can construct a TensorBlock starting at
// `data + desc.offset()`, with a `desc.dimensions()` block sizes.
static const bool is_col_major = Layout == ColMajor;
// Find out how many inner dimensions have a matching size.
int num_matching_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (data_dims[dim] != desc.dimensions()[dim]) break;
++num_matching_inner_dims;
}
// All the outer dimensions must be of size `1`, except a single dimension
// before the matching inner dimension (`3` in the example above).
bool can_use_direct_access = true;
for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (desc.dimension(dim) != 1) {
can_use_direct_access = false;
break;
}
}
if (can_use_direct_access) {
const Scalar* block_start = data + desc.offset();
return TensorMaterializedBlock(internal::TensorBlockKind::kView,
block_start, desc.dimensions());
} else {
// Reuse destination buffer or allocate new buffer with scratch allocator.
const Storage storage = prepareStorage(desc, scratch);
typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
data, desc.offset());
TensorBlockIODst dst(storage.dimensions(), storage.strides(),
storage.data());
TensorBlockIO::Copy(dst, src);
return storage.AsTensorMaterializedBlock();
}
}
private:
TensorBlockKind m_kind;
const Scalar* m_data;
Dimensions m_dimensions;
XprType m_expr;
bool m_valid_expr;
};
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
// functor to the blocks produced by the underlying Tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
static const bool NoArgBlockAccess =
internal::is_void<typename ArgTensorBlock::XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
: m_arg_block(arg_block), m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
// BinaryOp functor to the blocks produced by the two underlying Tensor
// expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
static const bool NoArgBlockAccess =
internal::is_void<typename LhsTensorBlock::XprType>::value ||
internal::is_void<typename RhsTensorBlock::XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
const typename RhsTensorBlock::XprType> >::type
XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
const RhsTensorBlock& right_block,
const BinaryOp& functor)
: m_left_block(left_block),
m_right_block(right_block),
m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const {
return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
}
const Scalar* data() const { return NULL; }
void cleanup() {
m_left_block.cleanup();
m_right_block.cleanup();
}
private:
LhsTensorBlock m_left_block;
RhsTensorBlock m_right_block;
BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is a
// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
typedef typename ArgTensorBlock::XprType ArgXprType;
static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
const BlockFactory& factory)
: m_arg_block(arg_block), m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock,
typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
typedef typename Arg1TensorBlock::XprType Arg1XprType;
typedef typename Arg2TensorBlock::XprType Arg2XprType;
typedef typename Arg3TensorBlock::XprType Arg3XprType;
static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
internal::is_void<Arg2XprType>::value ||
internal::is_void<Arg3XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
Arg3XprType>::type>::type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
const Arg2TensorBlock& arg2_block,
const Arg3TensorBlock& arg3_block,
const BlockFactory& factory)
: m_arg1_block(arg1_block),
m_arg2_block(arg2_block),
m_arg3_block(arg3_block),
m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const {
return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
m_arg3_block.expr());
}
const Scalar* data() const { return NULL; }
void cleanup() {
m_arg1_block.cleanup();
m_arg2_block.cleanup();
m_arg3_block.cleanup();
}
private:
Arg1TensorBlock m_arg1_block;
Arg2TensorBlock m_arg2_block;
Arg3TensorBlock m_arg3_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
typedef typename packet_traits<Scalar>::type Packet;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size
};
public:
// Specifying the linear copy kind statically gives a ~30% speedup for small sizes.
enum class Kind {
Linear = 0, // src_stride == 1 && dst_stride == 1
Scatter = 1, // src_stride == 1 && dst_stride != 1
FillLinear = 2, // src_stride == 0 && dst_stride == 1
FillScatter = 3, // src_stride == 0 && dst_stride != 1
Gather = 4, // src_stride not in {0, 1} && dst_stride == 1
Random = 5 // everything else
};
struct Dst {
Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
Scalar* data;
};
struct Src {
Src(IndexType o, IndexType s, const Scalar* d)
: offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
const Scalar* data;
};
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
const Src& src,
const size_t count) {
Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
src.data);
}
private:
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const IndexType count, const IndexType dst_offset,
const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
const IndexType src_offset, const IndexType src_stride,
const Scalar* EIGEN_RESTRICT src_data) {
const Scalar* src = &src_data[src_offset];
Scalar* dst = &dst_data[dst_offset];
if (!Vectorizable) {
for (Index i = 0; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
return;
}
const IndexType vectorized_size = count - PacketSize;
IndexType i = 0;
if (kind == StridedLinearBufferCopy::Kind::Linear) {
// ******************************************************************** //
// Linear copy from `src` to `dst`.
const IndexType unrolled_size = count - 4 * PacketSize;
eigen_assert(src_stride == 1 && dst_stride == 1);
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
Packet p = ploadu<Packet>(src + i + j * PacketSize);
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
// Scatter from `src` to `dst`.
eigen_assert(src_stride == 1 && dst_stride != 1);
for (; i <= vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (; i < count; ++i) {
dst[i * dst_stride] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
// Fill `dst` with value at `*src`.
eigen_assert(src_stride == 0 && dst_stride == 1);
const IndexType unrolled_size = count - 4 * PacketSize;
Packet p = pload1<Packet>(src);
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = *src;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
// Scatter `*src` into `dst`.
eigen_assert(src_stride == 0 && dst_stride != 1);
Packet p = pload1<Packet>(src);
for (; i <= vectorized_size; i += PacketSize) {
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (; i < count; ++i) {
dst[i * dst_stride] = *src;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Gather) {
// Gather from `src` into `dst`.
eigen_assert(dst_stride == 1);
for (; i <= vectorized_size; i += PacketSize) {
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = src[i * src_stride];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Random) {
// Random.
for (; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
} else {
eigen_assert(false);
}
}
};
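// Usage sketch (illustrative, not part of the original header): picking a
// copy `Kind` from the (src_stride, dst_stride) pair and invoking the public
// `Run` overload. The function name, buffers and sizes are hypothetical.
inline void ExampleStridedLinearCopy() {
  typedef StridedLinearBufferCopy<float, Index> LinCopy;
  float src_buf[64];
  float dst_buf[64];
  for (int i = 0; i < 64; ++i) src_buf[i] = static_cast<float>(i);
  // src_stride == 1 && dst_stride == 1 selects Kind::Linear, the unrolled
  // and vectorized path.
  LinCopy::Run<LinCopy::Kind::Linear>(
      LinCopy::Dst(/*offset=*/0, /*stride=*/1, dst_buf),
      LinCopy::Src(/*offset=*/0, /*stride=*/1, src_buf),
      /*count=*/64);
  // src_stride == 1 && dst_stride != 1 would select Kind::Scatter, and a
  // src_stride of 0 (broadcasting one value) selects one of the Fill kinds.
}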
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for `src` we only need the strides to navigate through the source
// memory buffer.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
static const bool IsColMajor = (Layout == ColMajor);
typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef DSizes<int, NumDims> DimensionsMap;
struct Dst {
Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
IndexType dst_offset = 0)
: dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
struct Src {
Src(const Dimensions& src_strides, const Scalar* src,
IndexType src_offset = 0)
: strides(src_strides), data(src), offset(src_offset) {}
Dimensions strides;
const Scalar* data;
IndexType offset;
};
// Copies data to `dst` from `src`, using provided dimensions mapping:
//
// src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
//
// Returns the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
// Copy single scalar value from `src` to `dst`.
if (NumDims == 0) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Both `dst` and `src` must have a contiguous innermost dimension. We also
// accept the special case of stride `0`, because it's used as a trick to
// implement broadcasting.
{
int inner_dim = IsColMajor ? 0 : NumDims - 1;
EIGEN_UNUSED_VARIABLE(inner_dim);
eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
}
// Give a shorter name to `dst_to_src_dim_map`.
const DimensionsMap& dim_map = dst_to_src_dim_map;
// Do not squeeze reordered inner dimensions.
int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
// NOTE: We find the innermost dimension (contiguous in memory) in the dst
// block, and we write data linearly into that dimension, reading it from
// the src. If dimensions are reordered, we might end up reading data from
// the src with `stride != 1`.
//
// NOTE: Random-Read/Linear-Write can be up to ~2X faster than
// Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
// Find the innermost dimension in the dst whose size is not 1. This is the
// effective inner dim.
int num_size_one_inner_dims = 0;
for (int i = 0; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
if (dst.dims[dst_dim] != 1) break;
num_size_one_inner_dims++;
}
// If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
if (num_size_one_inner_dims == NumDims) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Outermost dimension in the dst with `stride == 1` (contiguous in memory).
const int dst_stride1_dim = IsColMajor
? num_size_one_inner_dims
: NumDims - num_size_one_inner_dims - 1;
// Dimension in the src that corresponds to the dst innermost dimension.
const int src_dim_for_dst_stride1_dim =
NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
// Size of the innermost dimension (length of contiguous blocks of memory).
IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
// Squeeze multiple inner dims into one if they are contiguous in `dst` and
// `src` memory, so we can do fewer linear copy calls.
for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
const IndexType dst_stride = dst.strides[dst_dim];
const IndexType src_stride = src.strides[dim_map[dst_dim]];
if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
dst_inner_dim_size *= dst.dims[dst_dim];
++num_size_one_inner_dims;
} else {
break;
}
}
// Setup strides to read data from `src` and write to `dst`.
IndexType input_offset = src.offset;
IndexType output_offset = dst.offset;
IndexType input_stride =
NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> it;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int idx = 0; // currently initialized iterator state index
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
if (dst.dims[dst_dim] == 1) continue;
it[idx].size = dst.dims[dst_dim];
it[idx].input_stride = src.strides[dim_map[dst_dim]];
it[idx].output_stride = dst.strides[dst_dim];
it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// Iterate copying data from src to dst.
const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
#define COPY_INNER_DIM(KIND) \
IndexType num_copied = 0; \
for (num_copied = 0; num_copied < block_total_size; \
num_copied += dst_inner_dim_size) { \
LinCopy::template Run<KIND>( \
typename LinCopy::Dst(output_offset, output_stride, dst.data), \
typename LinCopy::Src(input_offset, input_stride, src.data), \
dst_inner_dim_size); \
\
for (int j = 0; j < idx; ++j) { \
if (++it[j].count < it[j].size) { \
input_offset += it[j].input_stride; \
output_offset += it[j].output_stride; \
break; \
} \
it[j].count = 0; \
input_offset -= it[j].input_span; \
output_offset -= it[j].output_span; \
} \
} \
return num_copied;
if (input_stride == 1 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Linear);
} else if (input_stride == 1 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::Scatter);
} else if (input_stride == 0 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::FillLinear);
} else if (input_stride == 0 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::FillScatter);
} else if (output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Gather);
} else {
COPY_INNER_DIM(LinCopy::Kind::Random);
}
#undef COPY_INNER_DIM
}
// Copy from `src` to `dst` with an identity src->dst dimension map. Returns
// the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
const Src& src) {
DimensionsMap dst_to_src_map;
for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
return Copy(dst, src, dst_to_src_map);
}
private:
struct BlockIteratorState {
BlockIteratorState()
: size(0),
count(0),
input_stride(0),
output_stride(0),
input_span(0),
output_span(0) {}
IndexType size;
IndexType count;
IndexType input_stride;
IndexType output_stride;
IndexType input_span;
IndexType output_span;
};
// Compute how many inner dimensions we are allowed to squeeze when doing IO
// between two tensor blocks. It's safe to squeeze inner dimensions only if
// they are not reordered.
static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
int num_squeezable_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
if (dim_map[dim] != dim) break;
num_squeezable_dims++;
}
return num_squeezable_dims;
}
};
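// Usage sketch (illustrative, not part of the original header): copying a
// 2x3 RowMajor block between two contiguous buffers with the identity
// dimension map. The function and buffer names are hypothetical.
inline void ExampleTensorBlockIO() {
  typedef TensorBlockIO<float, Index, /*NumDims=*/2, RowMajor> BlockIO;
  float src_buf[6] = {0, 1, 2, 3, 4, 5};
  float dst_buf[6] = {0, 0, 0, 0, 0, 0};
  BlockIO::Dimensions dims(2, 3);
  // RowMajor strides for a [2, 3] tensor are {3, 1}: the inner dimension is
  // contiguous (stride 1) for both `dst` and `src`, as required by `Copy`.
  BlockIO::Dimensions strides(3, 1);
  BlockIO::Dst dst(dims, strides, dst_buf);
  BlockIO::Src src(strides, src_buf);
  const Index copied = BlockIO::Copy(dst, src);  // copies all 6 elements
  EIGEN_UNUSED_VARIABLE(copied);
}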
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see definition above).
//
// Also, currently the innermost dimension of `target` must have a stride `1`
// (contiguous in memory). This restriction could be lifted with `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
typename IndexType = Eigen::Index>
class TensorBlockAssignment {
// We will use coeff/packet path to evaluate block expressions.
typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
TensorBlockEvaluator;
typedef DSizes<IndexType, NumDims> Dimensions;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size
};
template <bool Vectorizable, typename Evaluator>
struct InnerDimAssign {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
for (IndexType i = 0; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
template <typename Evaluator>
struct InnerDimAssign<true, Evaluator> {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
typedef typename packet_traits<Scalar>::type Packet;
const IndexType unrolled_size = count - 4 * PacketSize;
const IndexType vectorized_size = count - PacketSize;
IndexType i = 0;
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
const IndexType idx = eval_offset + i + j * PacketSize;
Packet p = eval.template packet<Unaligned>(idx);
pstoreu<Scalar>(target + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
Packet p = eval.template packet<Unaligned>(eval_offset + i);
pstoreu<Scalar>(target + i, p);
}
for (; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
public:
struct Target {
Target(const Dimensions& target_dims, const Dimensions& target_strides,
Scalar* target_data, IndexType target_offset = 0)
: dims(target_dims),
strides(target_strides),
data(target_data),
offset(target_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
static Target target(const Dimensions& target_dims,
const Dimensions& target_strides, Scalar* target_data,
IndexType target_offset = 0) {
return Target(target_dims, target_strides, target_data, target_offset);
}
template <typename TargetDimsIndexType, typename TargetStridesIndexType>
static Target target(
const DSizes<TargetDimsIndexType, NumDims>& target_dims,
const DSizes<TargetStridesIndexType, NumDims>& target_strides,
Scalar* target_data, IndexType target_offset = 0) {
// DSizes constructor will do index type promotion if it's safe.
return Target(Dimensions(target_dims), Dimensions(target_strides),
target_data, target_offset);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Target& target, const TensorBlockExpr& expr) {
// Prepare evaluator for block expression.
DefaultDevice default_device;
TensorBlockEvaluator eval(expr, default_device);
// Tensor block expression dimensions should match the destination dimensions.
eigen_assert(dimensions_match(target.dims, eval.dimensions()));
static const int Layout = TensorBlockEvaluator::Layout;
static const bool is_col_major = Layout == ColMajor;
// Initialize the output inner dimension size based on the layout.
const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
IndexType output_inner_dim_size = target.dims[inner_dim_idx];
// Target inner dimension stride must be '1'.
eigen_assert(target.strides[inner_dim_idx] == 1);
// Squeeze multiple inner dims into one if they are contiguous in `target`.
IndexType num_squeezed_dims = 0;
for (Index i = 1; i < NumDims; ++i) {
const Index dim = is_col_major ? i : NumDims - i - 1;
const IndexType target_stride = target.strides[dim];
if (output_inner_dim_size == target_stride) {
output_inner_dim_size *= target.dims[dim];
num_squeezed_dims++;
} else {
break;
}
}
// Initialize output block iterator state. Dimensions in this array are
// always in inner-most -> outer-most order (column-major layout).
array<BlockIteratorState, NumDims> it;
int idx = 0; // currently initialized iterator state index
for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
it[idx].count = 0;
it[idx].size = target.dims[dim];
it[idx].output_stride = target.strides[dim];
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// We read the block expression from the beginning, and start writing data to
// `target` at the given offset.
IndexType input_offset = 0;
IndexType output_offset = target.offset;
// Iterate copying data from `eval` to `target`.
for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
// Assign to `target` at current offset.
InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
TensorBlockEvaluator>::Run(target.data + output_offset,
output_inner_dim_size, eval,
input_offset);
// Move input offset forward by the number of assigned coefficients.
input_offset += output_inner_dim_size;
// Update index.
for (int j = 0; j < idx; ++j) {
if (++it[j].count < it[j].size) {
output_offset += it[j].output_stride;
break;
}
it[j].count = 0;
output_offset -= it[j].output_span;
}
}
}
private:
struct BlockIteratorState {
BlockIteratorState()
: count(0), size(0), output_stride(0), output_span(0) {}
IndexType count;
IndexType size;
IndexType output_stride;
IndexType output_span;
};
};
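// Usage sketch (illustrative, not part of the original header): evaluating a
// block expression directly into a destination buffer. The function name,
// buffers and shapes are hypothetical; the expression type is deduced from
// the TensorMaps.
inline void ExampleTensorBlockAssignment() {
  float a_buf[6] = {0, 1, 2, 3, 4, 5};
  float b_buf[6] = {5, 4, 3, 2, 1, 0};
  float out_buf[6] = {0, 0, 0, 0, 0, 0};
  TensorMap<Tensor<float, 2, RowMajor> > a(a_buf, 2, 3);
  TensorMap<Tensor<float, 2, RowMajor> > b(b_buf, 2, 3);
  const auto expr = a + b;  // lazy TensorCwiseBinaryOp, evaluated by Run below
  typedef TensorBlockAssignment<float, 2, decltype(expr)> BlockAssign;
  DSizes<Index, 2> dims(2, 3);
  DSizes<Index, 2> strides(3, 1);  // RowMajor: the inner stride must be 1
  BlockAssign::Run(BlockAssign::target(dims, strides, out_buf), expr);
  // out_buf now holds {5, 5, 5, 5, 5, 5}.
}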
// -------------------------------------------------------------------------- //
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
......@@ -1971,9 +1971,9 @@ class TestPow_factor_tensor(TestActivation):
feed={"x": input},
fetch_list=[out_1, out_2, res, out_6])
assert np.array_equal(res_1, np.power(input, 2))
assert np.array_equal(res_2, np.power(input, 3))
assert np.array_equal(res_6, np.power(input, 3))
assert np.allclose(res_1, np.power(input, 2))
assert np.allclose(res_2, np.power(input, 3))
assert np.allclose(res_6, np.power(input, 3))
def test_error(self):
in1 = fluid.layers.data(
......