Unverified commit 495e7f9c authored by wuhuanzhou, committed by GitHub

Update eigen version to f612df27 (#31832)

* update eigen version to f612df27, test=develop

* fix compilation error, test=develop

* remove patch command in eigen, test=develop

* fix compilation error caused by call Eigen function with float16 and bfloat16, test=develop

* fix unittest error, test=develop

* fix unittest error caused by precision, test=develop

* remove patch files used by old version eigen, test=develop
Parent 587d99ae
@@ -14,11 +14,11 @@
include(ExternalProject)
# update eigen to the commit id 4da2c6b1 on 03/19/2020
# update eigen to the commit id f612df27 on 03/16/2021
set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26)
set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee)
cache_third_party(extern_eigen3
REPOSITORY ${EIGEN_REPOSITORY}
@@ -27,48 +27,6 @@ cache_third_party(extern_eigen3
if(WIN32)
add_definitions(-DEIGEN_STRONG_INLINE=inline)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
# For Windows,
# the Tensor header pulls in 'unistd.h' (Tensor:74), which is unavailable on Windows
# and causes a compilation error: "can not open file 'unistd.h'"
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
# For VS2015,
# TensorBlock.h:1028 triggers a compilation error:
# "syntax error"
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
elseif(LINUX)
# For gxx=4.8, __GXX_ABI_VERSION is less than 1004,
# which causes a compilation error in Geometry_SSE.h:38:
# "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)"
# refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
# adding -fabi-version=4 could avoid the above error, but causes "double free corruption" when compiling with gcc8,
# so use the following patch to solve the compilation error with different versions of gcc.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
# Compilers fully support constant expressions only since C++14,
# but Eigen uses some constant expressions, such as std::max and std::min, that are not supported in C++11,
# so add a patch to avoid the compilation error in C++11.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
if(WITH_ROCM)
# For HIPCC, Eigen::internal::device::numeric_limits is not marked EIGEN_DEVICE_FUNC,
# which causes a compiler error about calling a __host__ function from a __host__ __device__ function.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
# For HIPCC, Eigen::internal::scalar_sum_op<bool,bool> is not marked EIGEN_DEVICE_FUNC,
# which causes a compiler error about calling a __host__ function from a __host__ __device__ function.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
else()
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
endif()
endif()
set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
@@ -82,7 +40,7 @@ ExternalProject_Add(
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
PATCH_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
@@ -400,7 +400,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
out.device(d) = x * (temp1 + temp2).template cast<T>();
}
};
@@ -417,7 +417,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
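As a reference for the change above, here is a minimal scalar sketch (illustrative only, not part of this commit) of what HardShrinkFunctor computes; the hypothetical helper mirrors the functor's threshold member:

// Illustrative scalar equivalent of HardShrinkFunctor.
template <typename T>
T hard_shrink_scalar(T x, float threshold) {
  const bool below = x < static_cast<T>(-threshold);
  const bool above = x > static_cast<T>(threshold);
  // (below || above) plays the role of (temp1 + temp2): casting the boolean
  // mask to T and multiplying zeroes out values inside [-threshold, threshold].
  return x * static_cast<T>(below || above);
}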
@@ -24,7 +24,6 @@
namespace Eigen {
using bfloat16 = paddle::platform::bfloat16;
using complex64 = paddle::platform::complex64;
using complex128 = paddle::platform::complex128;
using float16 = paddle::platform::float16;
@@ -33,7 +32,8 @@ template <typename T>
struct NumTraits;
template <>
struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
struct NumTraits<paddle::platform::bfloat16>
: GenericNumTraits<paddle::platform::bfloat16> {
enum {
IsSigned = true,
IsInteger = false,
@@ -41,22 +41,22 @@ struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
RequireInitialization = false
};
HOSTDEVICE static inline bfloat16 epsilon() {
HOSTDEVICE static inline paddle::platform::bfloat16 epsilon() {
return paddle::platform::raw_uint16_to_bfloat16(0x3400);
}
HOSTDEVICE static inline bfloat16 dummy_precision() {
return bfloat16(1e-5f);
HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() {
return paddle::platform::bfloat16(1e-5f);
}
HOSTDEVICE static inline bfloat16 highest() {
HOSTDEVICE static inline paddle::platform::bfloat16 highest() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
}
HOSTDEVICE static inline bfloat16 lowest() {
HOSTDEVICE static inline paddle::platform::bfloat16 lowest() {
return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
}
HOSTDEVICE static inline bfloat16 infinity() {
HOSTDEVICE static inline paddle::platform::bfloat16 infinity() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
}
HOSTDEVICE static inline bfloat16 quiet_NaN() {
HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() {
return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
}
};
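// Note on the raw bit patterns above (illustrative comment, not in the
// original diff): bfloat16 is the upper 16 bits of an IEEE-754 float32
// (1 sign, 8 exponent, 7 mantissa bits), so raw_uint16_to_bfloat16 simply
// reinterprets the pattern. For example, 0x7f80 is +infinity (all exponent
// bits set, zero mantissa) and 0x7f7f is the largest finite value, about
// 3.39e38.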
@@ -137,68 +137,91 @@ namespace numext {
//////////// bfloat methods /////////////
template <>
HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isnan)(a);
}
template <>
HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isinf)(a);
}
template <>
HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) {
return (paddle::platform::isfinite)(a);
}
template <>
HOSTDEVICE inline bfloat16 exp(const bfloat16& a) {
return bfloat16(::expf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 exp(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::expf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 erf(const bfloat16& a) {
return bfloat16(::erff(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 erf(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::erff(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 log(const bfloat16& a) {
return bfloat16(::logf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 log(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::logf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) {
return bfloat16(::tanhf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 tanh(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::tanhf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) {
return bfloat16(::sqrtf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 sqrt(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::sqrtf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) {
return bfloat16(::ceilf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 ceil(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::ceilf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 floor(const bfloat16& a) {
return bfloat16(::floorf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 floor(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::floorf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 round(const bfloat16& a) {
return bfloat16(::roundf(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 round(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::roundf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
return bfloat16(::powf(static_cast<float>(a), static_cast<float>(b)));
HOSTDEVICE inline paddle::platform::bfloat16 pow(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return paddle::platform::bfloat16(
::powf(static_cast<float>(a), static_cast<float>(b)));
}
template <>
HOSTDEVICE inline bfloat16 abs(const bfloat16& a) {
return bfloat16(::fabs(static_cast<float>(a)));
HOSTDEVICE inline paddle::platform::bfloat16 abs(
const paddle::platform::bfloat16& a) {
return paddle::platform::bfloat16(::fabs(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline paddle::platform::bfloat16 mini(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return b < a ? b : a;
}
template <>
HOSTDEVICE inline paddle::platform::bfloat16 maxi(
const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
return a < b ? b : a;
}
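// Illustrative note (not in the original diff): these specializations are
// needed because Eigen's scalar_min_op / scalar_max_op (see the
// BinaryFunctors.h copy below) dispatch through numext::mini and
// numext::maxi, so min/max reductions over bfloat16 expressions resolve here.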
//////////// complex64 methods /////////////
@@ -398,5 +421,15 @@ HOSTDEVICE inline float16 abs(const float16& a) {
return float16(::fabs(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 mini(const float16& a, const float16& b) {
return b < a ? b : a;
}
template <>
HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
return a < b ? b : a;
}
} // namespace numext
} // namespace Eigen
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// clang-format off
#ifndef EIGEN_BINARY_FUNCTORS_H
#define EIGEN_BINARY_FUNCTORS_H
namespace Eigen {
namespace internal {
//---------- associative binary functors ----------
template<typename Arg1, typename Arg2>
struct binary_op_base
{
typedef Arg1 first_argument_type;
typedef Arg2 second_argument_type;
};
/** \internal
* \brief Template functor to compute the sum of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
#else
scalar_sum_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::padd(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
// TODO vectorize mixed sum
};
};
/** \internal
* \brief Template specialization to deprecate the summation of boolean expressions.
* This is required to solve Bug 426.
* \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
*/
template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
scalar_sum_op() {}
};
/** \internal
* \brief Template functor to compute the product of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_product_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
#else
scalar_product_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmul(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_mul(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
// TODO vectorize mixed product
};
};
/** \internal
* \brief Template functor to compute the conjugate product of two scalars
*
* This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_conj_product_op : binary_op_base<LhsScalar,RhsScalar>
{
enum {
Conj = NumTraits<LhsScalar>::IsComplex
};
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
{ return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = NumTraits<LhsScalar>::MulCost,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
};
};
/** \internal
* \brief Template functor to compute the min of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmin(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_min(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
};
};
/** \internal
* \brief Template functor to compute the max of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmax(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_max(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
};
};
/** \internal
* \brief Template functors for comparison of two scalars
* \todo Implement packet-comparisons
*/
template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = false
};
};
template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
typedef bool type;
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
{
typedef bool result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
};
/** \internal
* \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
// This functor is used by hypotNorm only for which it is faster to first apply abs
// on all coefficients prior to reduction through hypot.
// This way we avoid calling abs on positive and real entries, and this also permits
// to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
// through the same functor...
return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
enum
{
Cost = 3 * NumTraits<Scalar>::AddCost +
2 * NumTraits<Scalar>::MulCost +
2 * scalar_div_cost<Scalar,false>::value,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the pow of two scalars
*/
template<typename Scalar, typename Exponent>
struct scalar_pow_op : binary_op_base<Scalar,Exponent>
{
typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
#else
scalar_pow_op() {
typedef Scalar LhsScalar;
typedef Exponent RhsScalar;
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC
inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
};
template<typename Scalar, typename Exponent>
struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
};
//---------- non associative binary functors ----------
/** \internal
* \brief Template functor to compute the difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator-
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
#else
scalar_difference_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::psub(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
};
};
/** \internal
* \brief Template functor to compute the quotient of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator/()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_quotient_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
#else
scalar_quotient_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pdiv(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
enum {
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
Cost = scalar_div_cost<result_type,PacketAccess>::value
};
};
/** \internal
* \brief Template functor to compute the and of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator&&
*/
struct scalar_boolean_and_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
};
template<> struct functor_traits<scalar_boolean_and_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the or of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator||
*/
struct scalar_boolean_or_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
};
template<> struct functor_traits<scalar_boolean_or_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the xor of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator^
*/
struct scalar_boolean_xor_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
};
template<> struct functor_traits<scalar_boolean_xor_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the absolute difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::absolute_difference
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
{
typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
#else
scalar_absolute_difference_op() {
EIGEN_SCALAR_BINARY_OP_PLUGIN
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
{ return numext::absdiff(a,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pabsdiff(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
};
};
//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
// They are analogues to std::binder1st/binder2nd but with the following differences:
// - they are compatible with packetOp
// - they are portable across C++ versions (the std::binder* are deprecated in C++11)
template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::first_argument_type first_argument_type;
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
{ return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
first_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::first_argument_type first_argument_type;
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
template<typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
second_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
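// Usage sketch (illustrative, not part of the original header): binding the
// second argument of scalar_sum_op yields a unary "add a constant" functor
// that still vectorizes through packetOp.
//
//   typedef bind2nd_op<scalar_sum_op<float, float> > add_const_op;
//   add_const_op add5(5.0f);
//   float r = add5(2.0f);             // 7.0f
//   // Packet4f p = add5.packetOp(v); // v + {5,5,5,5} when vectorized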
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_BINARY_FUNCTORS_H
// clang-format on
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GEOMETRY_SSE_H
#define EIGEN_GEOMETRY_SSE_H
namespace Eigen {
namespace internal {
template <class Derived, class OtherDerived>
struct quat_product<Architecture::SSE, Derived, OtherDerived, float> {
enum {
AAlignment = traits<Derived>::Alignment,
BAlignment = traits<OtherDerived>::Alignment,
ResAlignment = traits<Quaternion<float>>::Alignment
};
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a,
const QuaternionBase<OtherDerived>& _b) {
evaluator<typename Derived::Coefficients> ae(_a.coeffs());
evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
Quaternion<float> res;
const __m128 mask = _mm_setr_ps(0.f, 0.f, 0.f, -0.f);
__m128 a = ae.template packet<AAlignment, __m128>(0);
__m128 b = be.template packet<BAlignment, __m128>(0);
__m128 s1 =
pmul(vec4f_swizzle1(a, 1, 2, 0, 2), vec4f_swizzle1(b, 2, 0, 1, 2));
__m128 s2 =
pmul(vec4f_swizzle1(a, 3, 3, 3, 1), vec4f_swizzle1(b, 0, 1, 2, 1));
pstoret<float, __m128, ResAlignment>(
&res.x(),
padd(psub(pmul(a, vec4f_swizzle1(b, 3, 3, 3, 3)),
pmul(vec4f_swizzle1(a, 2, 0, 1, 0),
vec4f_swizzle1(b, 1, 2, 0, 0))),
pxor(mask, padd(s1, s2))));
return res;
}
};
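// For reference (illustrative comment): the swizzled SSE code above evaluates
// the Hamilton product
//   (a * b).vec = a.w * b.vec + b.w * a.vec + a.vec x b.vec
//   (a * b).w   = a.w * b.w   - dot(a.vec, b.vec)
// with the sign flip on the w lane supplied by the (0, 0, 0, -0) xor mask.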
template <class Derived>
struct quat_conj<Architecture::SSE, Derived, float> {
enum { ResAlignment = traits<Quaternion<float>>::Alignment };
static inline Quaternion<float> run(const QuaternionBase<Derived>& q) {
evaluator<typename Derived::Coefficients> qe(q.coeffs());
Quaternion<float> res;
const Packet4f mask = _mm_setr_ps(-0.f, -0.f, -0.f, 0.f);
pstoret<float, Packet4f, ResAlignment>(
&res.x(),
pxor(mask,
qe.template packet<traits<Derived>::Alignment, Packet4f>(0)));
return res;
}
};
template <typename VectorLhs, typename VectorRhs>
struct cross3_impl<Architecture::SSE, VectorLhs, VectorRhs, float, true> {
enum {
ResAlignment =
traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
};
static inline typename plain_matrix_type<VectorLhs>::type run(
const VectorLhs& lhs, const VectorRhs& rhs) {
evaluator<VectorLhs> lhs_eval(lhs);
evaluator<VectorRhs> rhs_eval(rhs);
__m128 a =
lhs_eval.template packet<traits<VectorLhs>::Alignment, __m128>(0);
__m128 b =
rhs_eval.template packet<traits<VectorRhs>::Alignment, __m128>(0);
__m128 mul1 =
pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3));
__m128 mul2 =
pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3));
typename plain_matrix_type<VectorLhs>::type res;
pstoret<float, __m128, ResAlignment>(&res.x(), psub(mul1, mul2));
return res;
}
};
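// Illustrative note: mul1 - mul2 is the component form of the cross product,
// (a.yzx * b.zxy) - (a.zxy * b.yzx), with the fourth lane unused.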
template <class Derived, class OtherDerived>
struct quat_product<Architecture::SSE, Derived, OtherDerived, double> {
enum {
BAlignment = traits<OtherDerived>::Alignment,
ResAlignment = traits<Quaternion<double>>::Alignment
};
static inline Quaternion<double> run(const QuaternionBase<Derived>& _a,
const QuaternionBase<OtherDerived>& _b) {
const Packet2d mask =
_mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
Quaternion<double> res;
evaluator<typename Derived::Coefficients> ae(_a.coeffs());
evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
const double* a = _a.coeffs().data();
Packet2d b_xy = be.template packet<BAlignment, Packet2d>(0);
Packet2d b_zw = be.template packet<BAlignment, Packet2d>(2);
Packet2d a_xx = pset1<Packet2d>(a[0]);
Packet2d a_yy = pset1<Packet2d>(a[1]);
Packet2d a_zz = pset1<Packet2d>(a[2]);
Packet2d a_ww = pset1<Packet2d>(a[3]);
// two temporaries:
Packet2d t1, t2;
/*
* t1 = ww*xy + yy*zw
* t2 = zz*xy - xx*zw
* res.xy = t1 +/- swap(t2)
*/
t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
pstoret<double, Packet2d, ResAlignment>(&res.x(),
_mm_addsub_pd(t1, preverse(t2)));
#else
pstoret<double, Packet2d, ResAlignment>(&res.x(),
padd(t1, pxor(mask, preverse(t2))));
#endif
/*
* t1 = ww*zw - yy*xy
* t2 = zz*zw + xx*xy
* res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
*/
t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
pstoret<double, Packet2d, ResAlignment>(
&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
#else
pstoret<double, Packet2d, ResAlignment>(&res.z(),
psub(t1, pxor(mask, preverse(t2))));
#endif
return res;
}
};
template <class Derived>
struct quat_conj<Architecture::SSE, Derived, double> {
enum { ResAlignment = traits<Quaternion<double>>::Alignment };
static inline Quaternion<double> run(const QuaternionBase<Derived>& q) {
evaluator<typename Derived::Coefficients> qe(q.coeffs());
Quaternion<double> res;
const Packet2d mask0 = _mm_setr_pd(-0., -0.);
const Packet2d mask2 = _mm_setr_pd(-0., 0.);
pstoret<double, Packet2d, ResAlignment>(
&res.x(),
pxor(mask0,
qe.template packet<traits<Derived>::Alignment, Packet2d>(0)));
pstoret<double, Packet2d, ResAlignment>(
&res.z(),
pxor(mask2,
qe.template packet<traits<Derived>::Alignment, Packet2d>(2)));
return res;
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GEOMETRY_SSE_H
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the like), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
#else
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
namespace Eigen {
struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw
raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16) || \
(defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
template <class T>
explicit EIGEN_DEVICE_FUNC half(const T& val)
: half_impl::half_base(
half_impl::float_to_half_rtne(static_cast<float>(val))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
// +0.0 and -0.0 become false, everything else becomes true.
return (x & 0x7fff) != 0;
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
return static_cast<signed char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
return static_cast<unsigned char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
return static_cast<short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
return static_cast<unsigned short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
return static_cast<int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
return static_cast<unsigned int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
return static_cast<long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
return static_cast<unsigned long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
return static_cast<long long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
return static_cast<unsigned long long>(half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
return half_impl::half_to_float(*this);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
return static_cast<double>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
x = other.x;
return *this;
}
};
namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) {
return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) {
return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a) {
return __hneg(a);
}
EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
a = a + b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
a = a * b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
a = a - b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
a = a / b;
return a;
}
EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) {
return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) {
return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) {
return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) {
return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) {
return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) {
return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a,
const half& b) {
return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a,
const half& b) {
return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a,
const half& b) {
return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a,
const half& b) {
return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
half result;
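  // Negation just flips the fp16 sign bit (0x8000); no float round-trip needed.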
result.x = a.x ^ 0x8000;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
a = half(float(a) + float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
a = half(float(a) * float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
a = half(float(a) - float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a,
const half& b) {
return float(a) == float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a,
const half& b) {
return float(a) != float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a,
const half& b) {
return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a,
const half& b) {
return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a,
const half& b) {
return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a,
const half& b) {
return float(a) >= float(b);
}
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
return half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw
raw_uint16_to_half(unsigned short x) {
__half_raw h;
h.x = x;
return h;
}
union FP32 {
unsigned int u;
float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 300
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
__half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
FP32 f;
f.f = ff;
const FP32 f32infty = {255 << 23};
const FP32 f16max = {(127 + 16) << 23};
const FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
unsigned int sign_mask = 0x80000000u;
__half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
} else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
#endif
}
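// Worked example (illustrative): ff = 1.0f has bits 0x3f800000; the sign is 0
// and f.u >= (113 << 23), so the normalized branch runs: the exponent is
// rebiased and the round-to-nearest-even bias added, mant_odd is 0, and
// o.x = f.u >> 13 = 0x3c00, the fp16 encoding of 1.0.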
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
const FP32 magic = {113 << 23};
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
} else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
#endif
}
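// Worked example (illustrative): h.x = 0x3c00 (fp16 1.0) gives
// o.u = (0x3c00 & 0x7fff) << 13 = 0x07800000; the exponent is neither
// all-ones nor zero, so adding (127 - 15) << 23 yields 0x3f800000,
// which is exactly 1.0f.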
// --- standard functions ---
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && \
defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return half(::hlog(a));
#else
return half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
return half(numext::log1p(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
return half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
return half(::sinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
return half(::cosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
return half(::tanf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f2 < f1 ? b : a;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f1 < f2 ? b : a;
#endif
}
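// Note (illustrative): the extra parentheses in half(min)(...) and
// half(max)(...) keep the names from being expanded by the min/max macros
// defined by <windows.h>, the same trick used for (isnan) and friends above.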
EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
os << static_cast<float>(v);
return os;
}
} // end namespace half_impl
// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace internal {
template <>
struct random_default_impl<half, false, false> {
static inline half run(const half& x, const half& y) {
return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
}
static inline half run() { return run(half(-1.f), half(1.f)); }
};
template <>
struct is_arithmetic<half> {
enum { value = true };
};
} // end namespace internal
} // end namespace Eigen
namespace std {
template <>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to
// http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to
// http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half(min)() {
return Eigen::half_impl::raw_uint16_to_half(0x400);
}
static Eigen::half lowest() {
return Eigen::half_impl::raw_uint16_to_half(0xfbff);
}
static Eigen::half(max)() {
return Eigen::half_impl::raw_uint16_to_half(0x7bff);
}
static Eigen::half epsilon() {
return Eigen::half_impl::raw_uint16_to_half(0x0800);
}
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() {
return Eigen::half_impl::raw_uint16_to_half(0x7c00);
}
static Eigen::half quiet_NaN() {
return Eigen::half_impl::raw_uint16_to_half(0x7e00);
}
static Eigen::half signaling_NaN() {
return Eigen::half_impl::raw_uint16_to_half(0x7e00);
}
static Eigen::half denorm_min() {
return Eigen::half_impl::raw_uint16_to_half(0x1);
}
};
}
namespace Eigen {
template <>
struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
return Eigen::half(1e-2f);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
return half_impl::raw_uint16_to_half(0x7bff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
return half_impl::raw_uint16_to_half(0xfbff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
return half_impl::raw_uint16_to_half(0x7c00);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
return half_impl::raw_uint16_to_half(0x7c01);
}
};
} // end namespace Eigen
// C-like standard mathematical functions and transcendentals.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
Eigen::half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
return Eigen::half(::sqrtf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a,
const Eigen::half& b) {
return Eigen::half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
return Eigen::half(::floorf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
return Eigen::half(::ceilf(float(a)));
}
namespace std {
#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(
const Eigen::half& a) const {
return static_cast<std::size_t>(a.x);
}
};
#endif
} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var,
int laneMask,
int width = warpSize) {
#if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(
__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(
__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
}
#endif
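// A hedged usage sketch of the overload above: a butterfly reduction that
// sums one Eigen::half per lane across a full warp, e.g.
//   Eigen::half v = ...;
//   for (int mask = warpSize / 2; mask > 0; mask >>= 1)
//     v = v + __shfl_xor(v, mask);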
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(
const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
}
#endif
#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
return (half_impl::isnan)(h);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
return (half_impl::isinf)(h);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
return (half_impl::isfinite)(h);
}
} // namespace numext
} // namespace Eigen
#endif
#endif // EIGEN_HALF_CUDA_H
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATHFUNCTIONS_H
#define EIGEN_MATHFUNCTIONS_H
// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
// TODO this should better be moved to NumTraits
#define EIGEN_PI \
3.141592653589793238462643383279502884197169399375105820974944592307816406L
namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's define our own
// overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist
// for more recent versions too.
#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <= 1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
long double abs(long double x) { return (fabsl(x)); }
#endif
namespace internal {
/** \internal \class global_math_functions_filtering_base
*
* What it does:
* Defines a typedef 'type' as follows:
* - if type T has a member typedef
* Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
* global_math_functions_filtering_base<T>::type is a typedef for it.
* - otherwise, global_math_functions_filtering_base<T>::type is a typedef for
* T.
*
* How it's used:
* To allow defining the global math functions (like sin...) in certain
* cases, like the Array expressions.
* When you do sin(array1+array2), the object array1+array2 has a complicated
* expression type; all you need to know
* is that it inherits ArrayBase. So we implement a partial specialization of
* sin_impl for ArrayBase<Derived>.
* We must therefore make sure to use sin_impl<ArrayBase<Derived> > and not
* sin_impl<Derived>, otherwise our partial specialization
* won't be used. How does sin know that? That's exactly what
* global_math_functions_filtering_base tells it.
*
* How it's implemented:
* SFINAE in the style of enable_if. Highly susceptible to breaking compilers.
* With GCC it does work, but if you replace
* the typename dummy by an integer template parameter, it doesn't work
* anymore!
*/
template <typename T, typename dummy = void>
struct global_math_functions_filtering_base {
typedef T type;
};
template <typename T>
struct always_void {
typedef void type;
};
template <typename T>
struct global_math_functions_filtering_base<
T,
typename always_void<
typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::
type> {
typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
};
#define EIGEN_MATHFUNC_IMPL(func, scalar) \
Eigen::internal::func##_impl< \
typename Eigen::internal::global_math_functions_filtering_base< \
scalar>::type>
#define EIGEN_MATHFUNC_RETVAL(func, scalar) \
typename Eigen::internal::func##_retval< \
typename Eigen::internal::global_math_functions_filtering_base< \
scalar>::type>::type
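// A sketch of the dispatch described above: for a plain scalar such as
// float, global_math_functions_filtering_base<float>::type is float, so
// EIGEN_MATHFUNC_IMPL(sin, float) selects sin_impl<float>; for an array
// expression type that typedefs
// Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl to
// ArrayBase<Derived>, the same macro selects sin_impl<ArrayBase<Derived> >.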
/****************************************************************************
* Implementation of real *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct real_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) { return x; }
};
template <typename Scalar>
struct real_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
using std::real;
return real(x);
}
};
template <typename Scalar>
struct real_impl : real_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct real_impl<std::complex<T>> {
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x) { return x.real(); }
};
#endif
template <typename Scalar>
struct real_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of imag *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct imag_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar&) { return RealScalar(0); }
};
template <typename Scalar>
struct imag_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
using std::imag;
return imag(x);
}
};
template <typename Scalar>
struct imag_impl : imag_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct imag_impl<std::complex<T>> {
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x) { return x.imag(); }
};
#endif
template <typename Scalar>
struct imag_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of real_ref *
****************************************************************************/
template <typename Scalar>
struct real_ref_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[0];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x) {
return reinterpret_cast<const RealScalar*>(&x)[0];
}
};
template <typename Scalar>
struct real_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
/****************************************************************************
* Implementation of imag_ref *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct imag_ref_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[1];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x) {
return reinterpret_cast<RealScalar*>(&x)[1];
}
};
template <typename Scalar>
struct imag_ref_default_impl<Scalar, false> {
EIGEN_DEVICE_FUNC
static inline Scalar run(Scalar&) { return Scalar(0); }
EIGEN_DEVICE_FUNC
static inline const Scalar run(const Scalar&) { return Scalar(0); }
};
template <typename Scalar>
struct imag_ref_impl
: imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
template <typename Scalar>
struct imag_ref_retval {
typedef typename NumTraits<Scalar>::Real& type;
};
/****************************************************************************
* Implementation of conj *
****************************************************************************/
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct conj_default_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) { return x; }
};
template <typename Scalar>
struct conj_default_impl<Scalar, true> {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
using std::conj;
return conj(x);
}
};
template <typename Scalar>
struct conj_impl : conj_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <typename T>
struct conj_impl<std::complex<T>> {
EIGEN_DEVICE_FUNC
static inline std::complex<T> run(const std::complex<T>& x) {
return std::complex<T>(x.real(), -x.imag());
}
};
#endif
template <typename Scalar>
struct conj_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of abs2 *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct abs2_impl_default {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) { return x * x; }
};
template <typename Scalar>
struct abs2_impl_default<Scalar, true> // IsComplex
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return x.real() * x.real() + x.imag() * x.imag();
}
};
template <typename Scalar>
struct abs2_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return abs2_impl_default<Scalar, NumTraits<Scalar>::IsComplex>::run(x);
}
};
template <typename Scalar>
struct abs2_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of norm1 *
****************************************************************************/
template <typename Scalar, bool IsComplex>
struct norm1_default_impl;
template <typename Scalar>
struct norm1_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x.real()) + abs(x.imag());
}
};
template <typename Scalar>
struct norm1_default_impl<Scalar, false> {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
};
template <typename Scalar>
struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
template <typename Scalar>
struct norm1_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of hypot *
****************************************************************************/
template <typename Scalar>
struct hypot_impl;
template <typename Scalar>
struct hypot_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of cast *
****************************************************************************/
template <typename OldType, typename NewType>
struct cast_impl {
EIGEN_DEVICE_FUNC
static inline NewType run(const OldType& x) {
return static_cast<NewType>(x);
}
};
// here, for once, we're plainly returning NewType: we don't want cast to do
// weird things.
template <typename OldType, typename NewType>
EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) {
return cast_impl<OldType, NewType>::run(x);
}
/****************************************************************************
* Implementation of round *
****************************************************************************/
#if EIGEN_HAS_CXX11_MATH
template <typename Scalar>
struct round_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
EIGEN_USING_STD_MATH(round);
return round(x);
}
};
#else
template <typename Scalar>
struct round_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
EIGEN_USING_STD_MATH(floor);
EIGEN_USING_STD_MATH(ceil);
return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
}
};
#endif
template <typename Scalar>
struct round_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of rint *
****************************************************************************/
template <typename Scalar>
struct rint_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
NUMERIC_TYPE_MUST_BE_REAL)
#if EIGEN_HAS_CXX11_MATH
EIGEN_USING_STD_MATH(rint);
#endif
return rint(x);
}
};
#if !EIGEN_HAS_CXX11_MATH
template <>
struct rint_impl<double> {
EIGEN_DEVICE_FUNC
static inline double run(const double& x) { return ::rint(x); }
};
template <>
struct rint_impl<float> {
EIGEN_DEVICE_FUNC
static inline float run(const float& x) { return ::rintf(x); }
};
#endif
template <typename Scalar>
struct rint_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of arg *
****************************************************************************/
#if EIGEN_HAS_CXX11_MATH
template <typename Scalar>
struct arg_impl {
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIP does not seem to have a native device side implementation for the
// math routine "arg"
using std::arg;
#else
EIGEN_USING_STD_MATH(arg);
#endif
return arg(x);
}
};
#else
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct arg_default_impl {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
}
};
template <typename Scalar>
struct arg_default_impl<Scalar, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) {
EIGEN_USING_STD_MATH(arg);
return arg(x);
}
};
template <typename Scalar>
struct arg_impl : arg_default_impl<Scalar> {};
#endif
template <typename Scalar>
struct arg_retval {
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of expm1 *
****************************************************************************/
// This implementation is based on GSL Math's expm1.
namespace std_fallback {
// fallback expm1 implementation in case there is no expm1(Scalar) function in
// the namespace of Scalar,
// or no suitable std::expm1 function is available. Implementation
// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
template <typename Scalar>
EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x);
if (numext::equal_strict(u, Scalar(1))) {
return x;
}
Scalar um1 = u - RealScalar(1);
if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1);
}
EIGEN_USING_STD_MATH(log);
Scalar logu = log(u);
return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
}
}
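// Why the Kahan trick above matters, as a rough numeric sketch: for tiny x,
// exp(x) rounds to exactly 1, so the naive exp(x) - 1 returns 0 with 100%
// relative error (e.g. x = 1e-8f in float), whereas the fallback returns x,
// correct to within rounding; for larger x the factor x / log(u)
// compensates the cancellation error committed in u - 1.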
template <typename Scalar>
struct expm1_impl {
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
using std::expm1;
#else
using std_fallback::expm1;
#endif
return expm1(x);
}
};
// Specialization for complex types that are not supported by std::expm1.
template <typename RealScalar>
struct expm1_impl<std::complex<RealScalar>> {
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
const std::complex<RealScalar>& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
RealScalar xr = x.real();
RealScalar xi = x.imag();
// expm1(z) = exp(z) - 1
// = exp(x + i * y) - 1
// = exp(x) * (cos(y) + i * sin(y)) - 1
// = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
// Imag(expm1(z)) = exp(x) * sin(y)
// Real(expm1(z)) = exp(x) * cos(y) - 1
//                = expm1(x) + exp(x) * (cos(y) - 1)
//                = expm1(x) - exp(x) * (2 * sin(y / 2) ** 2)
// TODO better use numext::expm1 and numext::sin (but that would require
// forward declarations or moving this specialization down).
RealScalar erm1 = expm1_impl<RealScalar>::run(xr);
RealScalar er = erm1 + RealScalar(1.);
EIGEN_USING_STD_MATH(sin);
RealScalar sin2 = sin(xi / RealScalar(2.));
sin2 = sin2 * sin2;
RealScalar s = sin(xi);
RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
return std::complex<RealScalar>(real_part, er * s);
}
};
template <typename Scalar>
struct expm1_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of log1p *
****************************************************************************/
namespace std_fallback {
// fallback log1p implementation in case there is no log1p(Scalar) function in
// the namespace of Scalar,
// or no suitable std::log1p function is available
template <typename Scalar>
EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
Scalar log_1p = log(x1p);
const bool is_small = numext::equal_strict(x1p, Scalar(1));
const bool is_inf = numext::equal_strict(x1p, log_1p);
return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
}
}
template <typename Scalar>
struct log1p_impl {
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
using std::log1p;
#else
using std_fallback::log1p;
#endif
return log1p(x);
}
};
// Specialization for complex types that are not supported by std::log1p.
template <typename RealScalar>
struct log1p_impl<std::complex<RealScalar>> {
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
const std::complex<RealScalar>& x) {
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
return std_fallback::log1p(x);
}
};
template <typename Scalar>
struct log1p_retval {
typedef Scalar type;
};
/****************************************************************************
* Implementation of pow *
****************************************************************************/
template <typename ScalarX,
typename ScalarY,
bool IsInteger =
NumTraits<ScalarX>::IsInteger && NumTraits<ScalarY>::IsInteger>
struct pow_impl {
// typedef Scalar retval;
typedef typename ScalarBinaryOpTraits<
ScalarX,
ScalarY,
internal::scalar_pow_op<ScalarX, ScalarY>>::ReturnType result_type;
static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x,
const ScalarY& y) {
EIGEN_USING_STD_MATH(pow);
return pow(x, y);
}
};
template <typename ScalarX, typename ScalarY>
struct pow_impl<ScalarX, ScalarY, true> {
typedef ScalarX result_type;
static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) {
ScalarX res(1);
eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
if (y & 1) res *= x;
y >>= 1;
while (y) {
x *= x;
if (y & 1) res *= x;
y >>= 1;
}
return res;
}
};
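// The loop above is binary exponentiation; a worked example: for x = 3 and
// y = 13 = 0b1101, res accumulates x^1, x^4 and x^8, giving
// 3 * 81 * 6561 = 1594323 == 3^13 with only three squarings of x.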
/****************************************************************************
* Implementation of random *
****************************************************************************/
template <typename Scalar, bool IsComplex, bool IsInteger>
struct random_default_impl {};
template <typename Scalar>
struct random_impl : random_default_impl<Scalar,
NumTraits<Scalar>::IsComplex,
NumTraits<Scalar>::IsInteger> {};
template <typename Scalar>
struct random_retval {
typedef Scalar type;
};
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
random(const Scalar& x, const Scalar& y);
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
template <typename Scalar>
struct random_default_impl<Scalar, false, false> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
return x + (y - x) * Scalar(std::rand()) / Scalar(RAND_MAX);
}
static inline Scalar run() {
return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
}
};
enum {
meta_floor_log2_terminate,
meta_floor_log2_move_up,
meta_floor_log2_move_down,
meta_floor_log2_bogus
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2_selector {
enum {
middle = (lower + upper) / 2,
value = (upper <= lower + 1)
? int(meta_floor_log2_terminate)
: (n < (1 << middle)) ? int(meta_floor_log2_move_down)
: (n == 0) ? int(meta_floor_log2_bogus)
: int(meta_floor_log2_move_up)
};
};
template <unsigned int n,
int lower = 0,
int upper = sizeof(unsigned int) * CHAR_BIT - 1,
int selector = meta_floor_log2_selector<n, lower, upper>::value>
struct meta_floor_log2 {};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down> {
enum {
value = meta_floor_log2<
n,
lower,
meta_floor_log2_selector<n, lower, upper>::middle>::value
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up> {
enum {
value = meta_floor_log2<n,
meta_floor_log2_selector<n, lower, upper>::middle,
upper>::value
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate> {
enum {
value = (n >= ((unsigned int)(1) << (lower + 1))) ? lower + 1 : lower
};
};
template <unsigned int n, int lower, int upper>
struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus> {
// no value, error at compile time
};
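// A compile-time example of the binary search above:
// meta_floor_log2<32768>::value is 15 and meta_floor_log2<32767>::value is
// 14; this is exactly how rand_bits is derived from RAND_MAX + 1 below.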
template <typename Scalar>
struct random_default_impl<Scalar, false, true> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
if (y <= x) return x;
// ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
typedef typename make_unsigned<Scalar>::type ScalarU;
// ScalarX is the widest of ScalarU and unsigned int.
// We'll deal only with ScalarX and unsigned int below thus avoiding signed
// types and arithmetic and signed overflows (which are undefined behavior).
typedef typename conditional<(ScalarU(-1) > unsigned(-1)),
ScalarU,
unsigned>::type ScalarX;
// The following difference doesn't overflow, provided our integer types are
// two's complement and have the same number of padding bits in signed and
// unsigned variants.
// This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max)
divisor = (rand_max + 1) / (range + 1);
else
multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do {
offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range);
return Scalar(ScalarX(x) + offset);
}
static inline Scalar run() {
#ifdef EIGEN_MAKING_DOCS
return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
#else
enum {
rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value,
scalar_bits = sizeof(Scalar) * CHAR_BIT,
shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
offset = NumTraits<Scalar>::IsSigned
? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits, scalar_bits) - 1))
: 0};
return Scalar((std::rand() >> shift) - offset);
#endif
}
};
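// A concrete run of the rejection sampling above, assuming RAND_MAX ==
// 32767: for run(0, 5) the range is 5, so divisor = 32768 / 6 == 5461 and
// offset = rand() / 5461 lies in 0..6; the value 6 (raw draws 32766 and
// 32767) exceeds the range, is rejected and redrawn, keeping 0..5 uniform.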
template <typename Scalar>
struct random_default_impl<Scalar, true, false> {
static inline Scalar run(const Scalar& x, const Scalar& y) {
return Scalar(random(x.real(), y.real()), random(x.imag(), y.imag()));
}
static inline Scalar run() {
typedef typename NumTraits<Scalar>::Real RealScalar;
return Scalar(random<RealScalar>(), random<RealScalar>());
}
};
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
random(const Scalar& x, const Scalar& y) {
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
}
template <typename Scalar>
inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
}
// Implementation of is* functions
// std::is* do not work with fast-math and gcc; std::is* are available on MSVC
// 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && \
!(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || \
(EIGEN_COMP_MSVC >= 1800) || (EIGEN_COMP_CLANG)
#define EIGEN_USE_STD_FPCLASSIFY 1
#else
#define EIGEN_USE_STD_FPCLASSIFY 0
#endif
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isnan_impl(const T&) {
return false;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isinf_impl(const T&) {
return false;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<internal::is_integral<T>::value, bool>::type
isfinite_impl(const T&) {
return true;
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isfinite_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
return isfinite EIGEN_NOT_A_MACRO(x);
#else
return x <= NumTraits<T>::highest() && x >= NumTraits<T>::lowest();
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isinf_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
return isinf EIGEN_NOT_A_MACRO(x);
#else
return x > NumTraits<T>::highest() || x < NumTraits<T>::lowest();
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value) &&
(!NumTraits<T>::IsComplex),
bool>::type
isnan_impl(const T& x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
return isnan EIGEN_NOT_A_MACRO(x);
#else
return x != x;
#endif
}
#if (!EIGEN_USE_STD_FPCLASSIFY)
#if EIGEN_COMP_MSVC
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) {
return _fpclass(x) == _FPCLASS_NINF || _fpclass(x) == _FPCLASS_PINF;
}
// MSVC defines a _isnan builtin function, but for double only
EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) {
return _isnan(x) != 0;
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) {
return isinf_msvc_helper(x);
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) {
return isinf_msvc_helper(x);
}
EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) {
return isinf_msvc_helper(x);
}
#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
#if EIGEN_GNUC_AT_LEAST(5, 0)
#define EIGEN_TMP_NOOPT_ATTRIB \
EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
#else
// NOTE the inline qualifier and noinline attribute are both needed: the former
// is to avoid linking issues (duplicate symbol),
// while the latter prevents too aggressive optimizations in fast-math mode:
#define EIGEN_TMP_NOOPT_ATTRIB \
EIGEN_DEVICE_FUNC inline \
__attribute__((noinline, optimize("no-finite-math-only")))
#endif
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) {
return __builtin_isnan(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) {
return __builtin_isinf(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) {
return __builtin_isinf(x);
}
template <>
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) {
return __builtin_isinf(x);
}
#undef EIGEN_TMP_NOOPT_ATTRIB
#endif
#endif
// The following overloads are defined at the end of this file
template <typename T>
EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
template <typename T>
EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
template <typename T>
T generic_fast_tanh_float(const T& a_x);
} // end namespace internal
/****************************************************************************
* Generic math functions *
****************************************************************************/
namespace numext {
#if (!defined(EIGEN_GPUCC))
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
EIGEN_USING_STD_MATH(min);
return min EIGEN_NOT_A_MACRO(x, y);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO(x, y);
}
#else
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
return y < x ? y : x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float mini(const float& x,
const float& y) {
return fminf(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double mini(const double& x,
const double& y) {
return fmin(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x,
const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fminl" on HIP yet
return (x < y) ? x : y;
#else
return fminl(x, y);
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
return x < y ? y : x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float maxi(const float& x,
const float& y) {
return fmaxf(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double maxi(const double& x,
const double& y) {
return fmax(x, y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x,
const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fmaxl" on HIP yet
return (x > y) ? x : y;
#else
return fmaxl(x, y);
#endif
}
#endif
#if defined(SYCL_DEVICE_ONLY)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE( \
NAME, FUNC, RET_TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
return cl::sycl::FUNC(x); \
}
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC( \
NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, \
const ARG_TYPE2& y) { \
return cl::sycl::FUNC(x, y); \
}
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
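// For instance, SYCL_SPECIALIZE_BINARY_FUNC(mini, min, cl::sycl::cl_int)
// above expands (roughly) to:
//   template <>
//   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int mini(
//       const cl::sycl::cl_int& x, const cl::sycl::cl_int& y) {
//     return cl::sycl::min(x, y);
//   }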
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar)
real(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)>::type
real_ref(const Scalar& x) {
return internal::real_ref_impl<Scalar>::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)
real_ref(Scalar& x) {
return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar)
imag(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar)
arg(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)>::type
imag_ref(const Scalar& x) {
return internal::imag_ref_impl<Scalar>::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)
imag_ref(Scalar& x) {
return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(conj, Scalar)
conj(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar)
abs2(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) {
return x > y ? x - y : y - x;
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float absdiff(const float& x,
const float& y) {
return fabsf(x - y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double absdiff(const double& x,
const double& y) {
return fabs(x - y);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(
const long double& x, const long double& y) {
#if defined(EIGEN_HIPCC)
// no "fabsl" on HIP yet
return (x > y) ? x : y;
#else
return fabsl(x - y);
#endif
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar)
norm1(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar)
hypot(const Scalar& x, const Scalar& y) {
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar)
log1p(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) {
return ::log1pf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) {
return ::log1p(x);
}
#endif
template <typename ScalarX, typename ScalarY>
EIGEN_DEVICE_FUNC inline
typename internal::pow_impl<ScalarX, ScalarY>::result_type
pow(const ScalarX& x, const ScalarY& y) {
return internal::pow_impl<ScalarX, ScalarY>::run(x, y);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
#endif
template <typename T>
EIGEN_DEVICE_FUNC bool(isnan)(const T& x) {
return internal::isnan_impl(x);
}
template <typename T>
EIGEN_DEVICE_FUNC bool(isinf)(const T& x) {
return internal::isinf_impl(x);
}
template <typename T>
EIGEN_DEVICE_FUNC bool(isfinite)(const T& x) {
return internal::isfinite_impl(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(rint, Scalar)
rint(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar)
round(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
#endif
template <typename T>
EIGEN_DEVICE_FUNC T(floor)(const T& x) {
EIGEN_USING_STD_MATH(floor);
return floor(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) {
return ::floorf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) {
return ::floor(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC T(ceil)(const T& x) {
EIGEN_USING_STD_MATH(ceil);
return ceil(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) {
return ::ceilf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) {
return ::ceil(x);
}
#endif
/** Log base 2 for 32-bit positive integers.
* Conveniently returns 0 for x==0. */
inline int log2(int x) {
eigen_assert(x >= 0);
unsigned int v(x);
static const int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16,
18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17,
24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return table[(v * 0x07C4ACDDU) >> 27];
}
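// The function above first smears the highest set bit into every lower bit
// (an input with floor(log2(x)) == k becomes 2^(k+1) - 1), then a De
// Bruijn-style multiply-and-shift indexes the lookup table; e.g. it returns
// 4 for any x in [16, 31].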
/** \returns the square root of \a x.
*
* It is essentially equivalent to
* \code using std::sqrt; return sqrt(x); \endcode
* but slightly faster for float/double and some compilers (e.g., gcc),
* thanks to specializations when SSE is enabled.
*
* Its usage is justified in performance critical functions, like
* norm/normalize.
*/
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sqrt(const T& x) {
EIGEN_USING_STD_MATH(sqrt);
return sqrt(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T& x) {
EIGEN_USING_STD_MATH(log);
return log(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float& x) {
return ::logf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
return ::log(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<NumTraits<T>::IsSigned ||
NumTraits<T>::IsComplex,
typename NumTraits<T>::Real>::type
abs(const T& x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned ||
NumTraits<T>::IsComplex),
typename NumTraits<T>::Real>::type
abs(const T& x) {
return x;
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float& x) {
return ::fabsf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double& x) {
return ::fabs(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const std::complex<float>& x) {
return ::hypotf(x.real(), x.imag());
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(
const std::complex<double>& x) {
return ::hypot(x.real(), x.imag());
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T& x) {
EIGEN_USING_STD_MATH(exp);
return exp(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float& x) {
return ::expf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double& x) {
return ::exp(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
const std::complex<float>& x) {
float com = ::expf(x.real());
float res_real = com * ::cosf(x.imag());
float res_imag = com * ::sinf(x.imag());
return std::complex<float>(res_real, res_imag);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
const std::complex<double>& x) {
double com = ::exp(x.real());
double res_real = com * ::cos(x.imag());
double res_imag = com * ::sin(x.imag());
return std::complex<double>(res_real, res_imag);
}
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar)
expm1(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) {
return ::expm1f(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) {
return ::expm1(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T& x) {
EIGEN_USING_STD_MATH(cos);
return cos(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos, cos)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float& x) {
return ::cosf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cos(const double& x) {
return ::cos(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T& x) {
EIGEN_USING_STD_MATH(sin);
return sin(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float& x) {
return ::sinf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sin(const double& x) {
return ::sin(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T& x) {
EIGEN_USING_STD_MATH(tan);
return tan(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float& x) {
return ::tanf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double& x) {
return ::tan(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T& x) {
EIGEN_USING_STD_MATH(acos);
return acos(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T& x) {
EIGEN_USING_STD_MATH(acosh);
return acosh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float& x) {
return ::acosf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double acos(const double& x) {
return ::acos(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T& x) {
EIGEN_USING_STD_MATH(asin);
return asin(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T& x) {
EIGEN_USING_STD_MATH(asinh);
return asinh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float& x) {
return ::asinf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double asin(const double& x) {
return ::asin(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T& x) {
EIGEN_USING_STD_MATH(atan);
return atan(x);
}
#if EIGEN_HAS_CXX11_MATH
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T& x) {
EIGEN_USING_STD_MATH(atanh);
return atanh(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float& x) {
return ::atanf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double atan(const double& x) {
return ::atan(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T& x) {
EIGEN_USING_STD_MATH(cosh);
return cosh(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float& x) {
return ::coshf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cosh(const double& x) {
return ::cosh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T& x) {
EIGEN_USING_STD_MATH(sinh);
return sinh(x);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float& x) {
return ::sinhf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sinh(const double& x) {
return ::sinh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T& x) {
EIGEN_USING_STD_MATH(tanh);
return tanh(x);
}
#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) {
return internal::generic_fast_tanh_float(x);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float& x) {
return ::tanhf(x);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tanh(const double& x) {
return ::tanh(x);
}
#endif
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) {
EIGEN_USING_STD_MATH(fmod);
return fmod(a, b);
}
#if defined(SYCL_DEVICE_ONLY)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
#endif
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a,
const float& b) {
return ::fmodf(a, b);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a,
const double& b) {
return ::fmod(a, b);
}
#endif
#if defined(SYCL_DEVICE_ONLY)
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
#undef SYCL_SPECIALIZE_UNARY_FUNC
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
#undef SYCL_SPECIALIZE_BINARY_FUNC
#endif
} // end namespace numext
namespace internal {
template <typename T>
EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x) {
return (numext::isfinite)(numext::real(x)) &&
(numext::isfinite)(numext::imag(x));
}
template <typename T>
EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x) {
return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
}
template <typename T>
EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x) {
return ((numext::isinf)(numext::real(x)) ||
(numext::isinf)(numext::imag(x))) &&
(!(numext::isnan)(x));
}
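// Note how the three complex predicates interact: for
// std::complex<float>(inf, nan), isnan is true while isinf is false,
// because the trailing !isnan(x) term gives NaN priority over Inf.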
/****************************************************************************
* Implementation of fuzzy comparisons *
****************************************************************************/
template <typename Scalar, bool IsComplex, bool IsInteger>
struct scalar_fuzzy_default_impl {};
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, false> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
return numext::abs(x) <= numext::abs(y) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return numext::abs(x - y) <=
numext::mini(numext::abs(x), numext::abs(y)) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return x <= y || isApprox(x, y, prec);
}
};
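// A quick numeric sketch of isApprox above: with prec = 1e-3, x = 1000.0
// and y = 1000.5 compare approximately equal (|x - y| = 0.5 <= 1000 * 1e-3),
// while x = 1.0 and y = 1.0015 do not (0.0015 > 1.0 * 1e-3); taking the
// smaller magnitude as reference makes the test symmetric and strict.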
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, true> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x,
const Scalar&,
const RealScalar&) {
return x == Scalar(0);
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar&) {
return x == y;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x,
const Scalar& y,
const RealScalar&) {
return x <= y;
}
};
template <typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, true, false> {
typedef typename NumTraits<Scalar>::Real RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
return numext::abs2(x) <= numext::abs2(y) * prec * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x,
const Scalar& y,
const RealScalar& prec) {
return numext::abs2(x - y) <=
numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
}
};
template <typename Scalar>
struct scalar_fuzzy_impl
: scalar_fuzzy_default_impl<Scalar,
NumTraits<Scalar>::IsComplex,
NumTraits<Scalar>::IsInteger> {};
template <typename Scalar, typename OtherScalar>
EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(
const Scalar& x,
const OtherScalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(
x, y, precision);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline bool isApprox(
const Scalar& x,
const Scalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(
const Scalar& x,
const Scalar& y,
const typename NumTraits<Scalar>::Real& precision =
NumTraits<Scalar>::dummy_precision()) {
return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
}
/******************************************
*** The special case of the bool type ***
******************************************/
template <>
struct random_impl<bool> {
static inline bool run() { return random<int>(0, 1) == 0 ? false : true; }
};
template <>
struct scalar_fuzzy_impl<bool> {
typedef bool RealScalar;
template <typename OtherScalar>
EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x,
const bool&,
const bool&) {
return !x;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(bool x, bool y, bool) { return x == y; }
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const bool& x,
const bool& y,
const bool&) {
return (!x) || y;
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATHFUNCTIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// clang-format off
#ifndef EIGEN_META_H
#define EIGEN_META_H
#if defined(EIGEN_GPU_COMPILE_PHASE)
#include <cfloat>
#if defined(EIGEN_CUDA_ARCH)
#include <math_constants.h>
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
#endif
#endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
#include <cstdint>
#endif
namespace Eigen {
typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
/**
* \brief The Index type as used for the API.
* \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
* \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
*/
typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
namespace internal {
/** \internal
* \file Meta.h
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
* \note In case you wonder, yes we're aware that Boost already provides all these features,
* however, we don't want to add a dependency on Boost.
*/
// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
// and older versions do not provide *intptr_t types.
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
typedef std::intptr_t IntPtr;
typedef std::uintptr_t UIntPtr;
#else
typedef std::ptrdiff_t IntPtr;
typedef std::size_t UIntPtr;
#endif
struct true_type { enum { value = 1 }; };
struct false_type { enum { value = 0 }; };
template<bool Condition>
struct bool_constant;
template<>
struct bool_constant<true> : true_type {};
template<>
struct bool_constant<false> : false_type {};
template<bool Condition, typename Then, typename Else>
struct conditional { typedef Then type; };
template<typename Then, typename Else>
struct conditional <false, Then, Else> { typedef Else type; };
template<typename T> struct remove_reference { typedef T type; };
template<typename T> struct remove_reference<T&> { typedef T type; };
template<typename T> struct remove_pointer { typedef T type; };
template<typename T> struct remove_pointer<T*> { typedef T type; };
template<typename T> struct remove_pointer<T*const> { typedef T type; };
template <class T> struct remove_const { typedef T type; };
template <class T> struct remove_const<const T> { typedef T type; };
template <class T> struct remove_const<const T[]> { typedef T type[]; };
template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
template<typename T> struct remove_all { typedef T type; };
template<typename T> struct remove_all<const T> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T const&> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T&> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T const*> { typedef typename remove_all<T>::type type; };
template<typename T> struct remove_all<T*> { typedef typename remove_all<T>::type type; };
template<typename T> struct is_arithmetic { enum { value = false }; };
template<> struct is_arithmetic<float> { enum { value = true }; };
template<> struct is_arithmetic<double> { enum { value = true }; };
template<> struct is_arithmetic<long double> { enum { value = true }; };
template<> struct is_arithmetic<bool> { enum { value = true }; };
template<> struct is_arithmetic<char> { enum { value = true }; };
template<> struct is_arithmetic<signed char> { enum { value = true }; };
template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
template<> struct is_arithmetic<signed short> { enum { value = true }; };
template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
template<> struct is_arithmetic<signed int> { enum { value = true }; };
template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template< class T >
struct is_void : is_same<void, typename remove_const<T>::type> {};
#if EIGEN_HAS_CXX11
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
using std::is_integral;
#else
template<typename T> struct is_integral { enum { value = false }; };
template<> struct is_integral<bool> { enum { value = true }; };
template<> struct is_integral<char> { enum { value = true }; };
template<> struct is_integral<signed char> { enum { value = true }; };
template<> struct is_integral<unsigned char> { enum { value = true }; };
template<> struct is_integral<signed short> { enum { value = true }; };
template<> struct is_integral<unsigned short> { enum { value = true }; };
template<> struct is_integral<signed int> { enum { value = true }; };
template<> struct is_integral<unsigned int> { enum { value = true }; };
template<> struct is_integral<signed long> { enum { value = true }; };
template<> struct is_integral<unsigned long> { enum { value = true }; };
#if EIGEN_COMP_MSVC
template<> struct is_integral<signed __int64> { enum { value = true }; };
template<> struct is_integral<unsigned __int64> { enum { value = true }; };
#endif
#endif
#if EIGEN_HAS_CXX11
using std::make_unsigned;
#else
// TODO: Possibly improve this implementation of make_unsigned.
// It is currently used only by
// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
template<typename> struct make_unsigned;
template<> struct make_unsigned<char> { typedef unsigned char type; };
template<> struct make_unsigned<signed char> { typedef unsigned char type; };
template<> struct make_unsigned<unsigned char> { typedef unsigned char type; };
template<> struct make_unsigned<signed short> { typedef unsigned short type; };
template<> struct make_unsigned<unsigned short> { typedef unsigned short type; };
template<> struct make_unsigned<signed int> { typedef unsigned int type; };
template<> struct make_unsigned<unsigned int> { typedef unsigned int type; };
template<> struct make_unsigned<signed long> { typedef unsigned long type; };
template<> struct make_unsigned<unsigned long> { typedef unsigned long type; };
#if EIGEN_COMP_MSVC
template<> struct make_unsigned<signed __int64> { typedef unsigned __int64 type; };
template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
#endif
#endif
template <typename T> struct add_const { typedef const T type; };
template <typename T> struct add_const<T&> { typedef T& type; };
template <typename T> struct is_const { enum { value = 0 }; };
template <typename T> struct is_const<T const> { enum { value = 1 }; };
template<typename T> struct add_const_on_value_type { typedef const T type; };
template<typename T> struct add_const_on_value_type<T&> { typedef T const& type; };
template<typename T> struct add_const_on_value_type<T*> { typedef T const* type; };
template<typename T> struct add_const_on_value_type<T* const> { typedef T const* const type; };
template<typename T> struct add_const_on_value_type<T const* const> { typedef T const* const type; };
#if EIGEN_HAS_CXX11
using std::is_convertible;
#else
template<typename From, typename To>
struct is_convertible_impl
{
private:
struct any_conversion
{
template <typename T> any_conversion(const volatile T&);
template <typename T> any_conversion(T&);
};
struct yes {int a[1];};
struct no {int a[2];};
template<typename T>
static yes test(T, int);
template<typename T>
static no test(any_conversion, ...);
public:
static typename internal::remove_reference<From>::type* ms_from;
#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning ( disable : 2259 )
#endif
enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif
};
template<typename From, typename To>
struct is_convertible
{
enum { value = is_convertible_impl<From,To>::value };
};
template<typename T>
struct is_convertible<T,T&> { enum { value = false }; };
template<typename T>
struct is_convertible<const T,const T&> { enum { value = true }; };
#endif
/** \internal Allows one to enable/disable an overload
* according to a compile time condition.
*/
template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
#endif
namespace device {
template<typename T> struct numeric_limits
{
EIGEN_DEVICE_FUNC static T epsilon() { return 0; }
EIGEN_DEVICE_FUNC static T (max)() { assert(false && "Highest not supported for this type"); }
EIGEN_DEVICE_FUNC static T (min)() { assert(false && "Lowest not supported for this type"); }
EIGEN_DEVICE_FUNC static T infinity() { assert(false && "Infinity not supported for this type"); }
EIGEN_DEVICE_FUNC static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
};
template<> struct numeric_limits<float>
{
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
static float (max)() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_MAX_NORMAL_F;
#else
return HIPRT_MAX_NORMAL_F;
#endif
}
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
static float infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF_F;
#else
return HIPRT_INF_F;
#endif
}
EIGEN_DEVICE_FUNC
static float quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN_F;
#else
return HIPRT_NAN_F;
#endif
}
};
template<> struct numeric_limits<double>
{
EIGEN_DEVICE_FUNC
static double epsilon() { return __DBL_EPSILON__; }
EIGEN_DEVICE_FUNC
static double (max)() { return DBL_MAX; }
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
static double infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF;
#else
return HIPRT_INF;
#endif
}
EIGEN_DEVICE_FUNC
static double quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN;
#else
return HIPRT_NAN;
#endif
}
};
template<> struct numeric_limits<int>
{
EIGEN_DEVICE_FUNC
static int epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static int (max)() { return INT_MAX; }
EIGEN_DEVICE_FUNC
static int (min)() { return INT_MIN; }
};
template<> struct numeric_limits<unsigned int>
{
EIGEN_DEVICE_FUNC
static unsigned int epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned int (max)() { return UINT_MAX; }
EIGEN_DEVICE_FUNC
static unsigned int (min)() { return 0; }
};
template<> struct numeric_limits<long>
{
EIGEN_DEVICE_FUNC
static long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static long (max)() { return LONG_MAX; }
EIGEN_DEVICE_FUNC
static long (min)() { return LONG_MIN; }
};
template<> struct numeric_limits<unsigned long>
{
EIGEN_DEVICE_FUNC
static unsigned long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned long (max)() { return ULONG_MAX; }
EIGEN_DEVICE_FUNC
static unsigned long (min)() { return 0; }
};
template<> struct numeric_limits<long long>
{
EIGEN_DEVICE_FUNC
static long long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static long long (max)() { return LLONG_MAX; }
EIGEN_DEVICE_FUNC
static long long (min)() { return LLONG_MIN; }
};
template<> struct numeric_limits<unsigned long long>
{
EIGEN_DEVICE_FUNC
static unsigned long long epsilon() { return 0; }
EIGEN_DEVICE_FUNC
static unsigned long long (max)() { return ULLONG_MAX; }
EIGEN_DEVICE_FUNC
static unsigned long long (min)() { return 0; }
};
}
#endif
/** \internal
 * A base class to disable the default copy ctor and copy assignment operator.
*/
class noncopyable
{
EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
protected:
EIGEN_DEVICE_FUNC noncopyable() {}
EIGEN_DEVICE_FUNC ~noncopyable() {}
};
/** \internal
 * Provides access to the number of elements in the object as a compile-time constant expression.
* It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
*
* Similar to std::tuple_size, but more general.
*
* It currently supports:
* - any types T defining T::SizeAtCompileTime
* - plain C arrays as T[N]
* - std::array (c++11)
* - some internal types such as SingleRange and AllRange
*
* The second template parameter eases SFINAE-based specializations.
*/
template<typename T, typename EnableIf = void> struct array_size {
enum { value = Dynamic };
};
template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
enum { value = T::SizeAtCompileTime };
};
template<typename T, int N> struct array_size<const T (&)[N]> {
enum { value = N };
};
template<typename T, int N> struct array_size<T (&)[N]> {
enum { value = N };
};
#if EIGEN_HAS_CXX11
template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
enum { value = N };
};
template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
enum { value = N };
};
#endif
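// For illustration (hypothetical instantiations):
//   array_size<int(&)[4]>::value == 4 (C array reference)
//   array_size<std::array<float, 3> >::value == 3 (with C++11)
//   array_size<std::vector<int> >::value == Dynamic (size not resolvable at
//   compile time, so the primary template applies)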
/** \internal
* Analogue of the std::size free function.
* It returns the size of the container or view \a x of type \c T
*
* It currently supports:
* - any types T defining a member T::size() const
* - plain C arrays as T[N]
*
*/
template<typename T>
Index size(const T& x) { return x.size(); }
template<typename T,std::size_t N>
Index size(const T (&) [N]) { return N; }
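// For illustration: given `int a[5];`, size(a) returns 5 through the array
// overload, while for any container `c` with a size() member (e.g. an Eigen
// vector of length 3) size(c) simply forwards to c.size().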
/** \internal
* Convenient struct to get the result type of a unary or binary functor.
*
 * It supports both the current STL mechanism (using the result_type member) and
 * the upcoming next STL generation (using a templated result member).
* If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
*/
#if EIGEN_HAS_STD_RESULT_OF
template<typename T> struct result_of {
typedef typename std::result_of<T>::type type1;
typedef typename remove_all<type1>::type type;
};
#else
template<typename T> struct result_of { };
struct has_none {int a[1];};
struct has_std_result_type {int a[2];};
struct has_tr1_result {int a[3];};
template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
template<typename Func, typename ArgType>
struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
template<typename Func, typename ArgType>
struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
template<typename Func, typename ArgType>
struct result_of<Func(ArgType)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename unary_result_of_select<Func, ArgType, FunctorType>::type type;
};
template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
{typedef typename Func::result_type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
{typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct result_of<Func(ArgType0,ArgType1)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
{typedef typename Func::result_type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
template<typename T>
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
template<typename T>
static has_tr1_result testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
static has_none testFunctor(...);
// note that the following indirection is needed for gcc-3.3
enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
};
#endif
struct meta_yes { char a[1]; };
struct meta_no { char a[2]; };
// Check whether T::ReturnType does exist
template <typename T>
struct has_ReturnType
{
template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
template <typename C> static meta_no testFunctor(...);
enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
};
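// For illustration (hypothetical types): given
//   struct WithRT { typedef int ReturnType; };
//   struct WithoutRT {};
// has_ReturnType<WithRT>::value is 1 and has_ReturnType<WithoutRT>::value is 0,
// because the first testFunctor overload is only viable when C::ReturnType
// names a type.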
template<typename T> const T* return_ptr();
template <typename T, typename IndexType=Index>
struct has_nullary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template <typename T, typename IndexType=Index>
struct has_unary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template <typename T, typename IndexType=Index>
struct has_binary_operator
{
template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
static meta_no testFunctor(...);
enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
};
/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
* Usage example: \code meta_sqrt<1023>::ret \endcode
*/
template<int Y,
int InfX = 0,
int SupX = ((Y==1) ? 1 : Y/2),
bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
// use ?: instead of || just to shut up a stupid gcc 4.3 warning
class meta_sqrt
{
enum {
MidX = (InfX+SupX)/2,
TakeInf = MidX*MidX > Y ? 1 : 0,
NewInf = int(TakeInf) ? InfX : int(MidX),
NewSup = int(TakeInf) ? int(MidX) : SupX
};
public:
enum { ret = meta_sqrt<Y,NewInf,NewSup>::ret };
};
template<int Y, int InfX, int SupX>
class meta_sqrt<Y, InfX, SupX, true> { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
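// For illustration: meta_sqrt<1023>::ret == 31, since 31*31 = 961 <= 1023 and
// 32*32 = 1024 > 1023; the bisection above keeps narrowing [InfX, SupX] until
// the interval width is at most 1.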
/** \internal Computes the least common multiple of two positive integer A and B
* at compile-time. It implements a naive algorithm testing all multiples of A.
* It thus works better if A>=B.
*/
template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
struct meta_least_common_multiple
{
enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
};
template<int A, int B, int K>
struct meta_least_common_multiple<A,B,K,true>
{
enum { ret = A*K };
};
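// For illustration: meta_least_common_multiple<6, 4>::ret == 12; the recursion
// tests the multiples 6*1 = 6 and 6*2 = 12 and stops at the first one that is
// divisible by 4.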
/** \internal determines whether the product of two numeric types is allowed and what the return type is */
template<typename T, typename U> struct scalar_product_traits
{
enum { Defined = 0 };
};
// FIXME quick workaround around current limitation of result_of
// template<typename Scalar, typename ArgType0, typename ArgType1>
// struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
// typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
// };
/** \internal Obtains a POD type suitable to use as storage for an object of a size
* of at most Len bytes, aligned as specified by \c Align.
*/
template<unsigned Len, unsigned Align>
struct aligned_storage {
struct type {
EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
};
};
} // end namespace internal
namespace numext {
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
#if defined(EIGEN_GPU_COMPILE_PHASE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
#endif
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template<typename T>
EIGEN_DEVICE_FUNC
T div_ceil(const T &a, const T &b)
{
return (a+b-1) / b;
}
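// For illustration: with T = int, div_ceil(10, 3) == (10 + 3 - 1) / 3 == 4 and
// div_ceil(9, 3) == 3; the (a + b - 1) trick rounds up in pure integer
// arithmetic, which is why a >= 0 and b > 0 are required.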
// The aim of the following functions is to bypass -Wfloat-equal warnings
// when we really want a strict equality comparison on floating points.
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const X& x,const Y& y) { return x == y; }
#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
#endif
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
#endif
/** \internal extract the bits of the float \a x */
inline unsigned int as_uint(float x)
{
unsigned int ret;
std::memcpy(&ret, &x, sizeof(float));
return ret;
}
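// For illustration (assuming IEEE-754 binary32 floats): as_uint(1.0f) yields
// 0x3f800000u and as_uint(-0.0f) yields 0x80000000u; std::memcpy is used
// instead of a pointer cast to avoid strict-aliasing violations.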
} // end namespace numext
} // end namespace Eigen
// Define portable (u)int{32,64} types
#if EIGEN_HAS_CXX11
#include <cstdint>
namespace Eigen {
namespace numext {
typedef std::uint8_t uint8_t;
typedef std::int8_t int8_t;
typedef std::uint16_t uint16_t;
typedef std::int16_t int16_t;
typedef std::uint32_t uint32_t;
typedef std::int32_t int32_t;
typedef std::uint64_t uint64_t;
typedef std::int64_t int64_t;
}
}
#else
// Without c++11, all compilers able to compile Eigen also
// provide the C99 stdint.h header file.
#include <stdint.h>
namespace Eigen {
namespace numext {
typedef ::uint8_t uint8_t;
typedef ::int8_t int8_t;
typedef ::uint16_t uint16_t;
typedef ::int16_t int16_t;
typedef ::uint32_t uint32_t;
typedef ::int32_t int32_t;
typedef ::uint64_t uint64_t;
typedef ::int64_t int64_t;
}
}
#endif
#endif // EIGEN_META_H
// clang-format on
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//#ifndef EIGEN_CXX11_TENSOR_MODULE
//#define EIGEN_CXX11_TENSOR_MODULE
#include "../../../Eigen/Core"
#if EIGEN_HAS_CXX11
#include "../SpecialFunctions"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
* This module provides a Tensor class for storing arbitrarily indexed
* objects.
*
* \code
* #include <Eigen/CXX11/Tensor>
* \endcode
*
* Much of the documentation can be found \ref eigen_tensors "here".
*/
#include <cmath>
#include <cstddef>
#include <cstring>
#include <random>
#ifdef _WIN32
typedef __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#include <windows.h>
#else
#include <stdint.h>
#include <unistd.h>
#endif
#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#include <mach/mach_time.h>
#else
#include <time.h>
#endif
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#endif
#ifdef EIGEN_USE_GPU
#include <iostream>
#if defined(EIGEN_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <atomic>
#endif
#include "src/Tensor/TensorMacros.h"
#include "src/Tensor/TensorForwardDeclarations.h"
#include "src/Tensor/TensorMeta.h"
#include "src/Tensor/TensorFunctors.h"
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceGpu.h"
#ifndef gpu_assert
#define gpu_assert(x)
#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"
#include "src/Tensor/TensorVolumePatch.h"
#include "src/Tensor/TensorBroadcasting.h"
#include "src/Tensor/TensorChipping.h"
#include "src/Tensor/TensorInflation.h"
#include "src/Tensor/TensorLayoutSwap.h"
#include "src/Tensor/TensorMorphing.h"
#include "src/Tensor/TensorPadding.h"
#include "src/Tensor/TensorReverse.h"
#include "src/Tensor/TensorShuffling.h"
#include "src/Tensor/TensorStriding.h"
#include "src/Tensor/TensorCustomOp.h"
#include "src/Tensor/TensorEvalTo.h"
#include "src/Tensor/TensorForcedEval.h"
#include "src/Tensor/TensorGenerator.h"
#include "src/Tensor/TensorAssign.h"
#include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorTrace.h"
#ifdef EIGEN_USE_SYCL
#include "src/Tensor/TensorReductionSycl.h"
#include "src/Tensor/TensorConvolutionSycl.h"
#include "src/Tensor/TensorContractionSycl.h"
#include "src/Tensor/TensorScanSycl.h"
#endif
#include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h"
#include "src/Tensor/TensorStorage.h"
#include "src/Tensor/Tensor.h"
#include "src/Tensor/TensorFixedSize.h"
#include "src/Tensor/TensorMap.h"
#include "src/Tensor/TensorRef.h"
#include "src/Tensor/TensorIO.h"
#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_HAS_CXX11
//#endif // EIGEN_CXX11_TENSOR_MODULE
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
namespace Eigen {
namespace internal {
// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.
// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
const DSizes<IndexType, NumDims>& dimensions) {
DSizes<IndexType, NumDims> strides;
if (NumDims == 0) return strides;
// TODO(ezhulenev): Use templates to unroll this loop (similar to
// h_array_reduce in CXX11meta.h)? Benchmark it.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i - 1] * dimensions[i - 1];
}
} else {
strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dimensions[i + 1];
}
}
return strides;
}
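// For illustration: for dimensions [2, 3, 4] the ColMajor strides are
// [1, 2, 6] (innermost dimension first) and the RowMajor strides are
// [12, 4, 1], so a coefficient at multi-index (i, j, k) lives at linear
// offset i*s[0] + j*s[1] + k*s[2].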
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
const Eigen::array<IndexType, NumDims>& dimensions) {
return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
const Sizes<Indices...>& sizes) {
return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
// -------------------------------------------------------------------------- //
// Tensor block shape type defines the shape preference for the blocks
// extracted from the larger tensor.
//
// Example: blocks of 100 elements from the large 100x100 tensor:
// - tensor: 100x100
// - target_block_size: 100
//
// TensorBlockShapeType:
// - kUniformAllDims: 100 blocks of size 10x10
// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
// or row major layout)
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
struct TensorBlockResourceRequirements {
TensorBlockShapeType shape_type; // target block shape
size_t size; // target block size
TensorOpCost cost_per_coeff; // cost of computing a single block element
#ifdef EIGEN_HIPCC
// For HIPCC, we need to explicitly declare the constructor which is
// implicitly invoked in the "merge" / "any" routines as a "device function";
// otherwise HIPCC errors out complaining about the lack of a matching
// constructor.
EIGEN_DEVICE_FUNC
TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
TensorOpCost cost_)
: shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
{}
#endif
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
TensorBlockShapeType shape_type, size_t size_in_bytes,
TensorOpCost cost) {
const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
return {shape_type, size, cost};
}
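// For illustration: withShapeAndSize<float>(shape_type, /*size_in_bytes=*/1024,
// cost) targets 1024 / sizeof(float) == 256 coefficients per block; the
// numext::maxi guard keeps the target at one coefficient even when
// size_in_bytes < sizeof(Scalar).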
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
TensorBlockShapeType shape_type, size_t size_in_bytes) {
// This default cost per coefficient is valid for most materialized tensor
// block evaluation implementations, because they typically just read
// coefficients from the underlying tensor storage, and write to the tensor
// block buffer (scratch or destination memory, reads and writes have linear
// access pattern). We ignore the fixed cost of block evaluation, because in
// practice it should negligible.
//
// Lazy block evaluation adds the cost of calling a functor for each
// coefficient.
//
// All non-trivial block evaluation implementations must provide their own
// cost approximation (e.g. shuffling inner dimension has a much higher cost
// because it reads memory randomly, although the total number of moved
// bytes is the same).
return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
{/*bytes_loaded=*/sizeof(Scalar),
/*bytes_stored=*/sizeof(Scalar),
/*compute_cycles=*/0});
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
size_in_bytes);
}
template <typename Scalar>
EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
size_t size_in_bytes) {
return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
size_in_bytes);
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
merge(const TensorBlockResourceRequirements& lhs,
const TensorBlockResourceRequirements& rhs) {
return {merge(lhs.shape_type, rhs.shape_type), // shape_type
merge(lhs.size, rhs.size), // size
merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
}
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
TensorOpCost cost) {
cost_per_coeff += cost;
return *this;
}
// This is a resource requirement that should be returned from expressions
// that do not have any block evaluation preference (e.g. default tensor
// expression with raw buffer access).
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
}
private:
using Requirements = TensorBlockResourceRequirements;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
return numext::maxi(lhs_size, rhs_size);
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorBlockShapeType
merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
rhs == TensorBlockShapeType::kSkewedInnerDims)
? TensorBlockShapeType::kSkewedInnerDims
: TensorBlockShapeType::kUniformAllDims;
}
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
TensorOpCost rhs_cost) {
return lhs_cost + rhs_cost;
}
};
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
  // If we evaluate a Tensor assignment and the expression on the left already
  // has a memory buffer, then we might apply a performance optimization and
  // evaluate the root expression directly into the final output memory.
  // Sometimes it is possible to reuse it for materializing subexpressions
  // inside an expression tree, to avoid dynamic memory allocation.
//
  // The pointer type of the underlying storage is erased, because passing the
  // Scalar type through all the expression evaluation layers would require far
  // too many templates. In practice the destination buffer type should always
  // match the evaluated expression scalar type.
class DestinationBuffer {
public:
enum DestinationBufferKind : int {
// The above explicit specification of "int" as the enum basetype is
// needed to get around a HIPCC link error ("the field type is not
// amp-compatible")
// which is issued for class members with the enum type.
// TODO(rocm):
// remove the "int" basetype once HIPCC has been fixed to not error out
// in the above scenario.
// Destination buffer is not defined (`m_data` == nullptr).
kEmpty,
// Tensor block defined by an owning tensor block descriptor can fit
// contiguously into the destination buffer. In this case it's safe to
// materialize tensor block in the destination buffer, wrap it in a
// TensorMap, and use to build Eigen expression on top of it.
kContiguous,
// Destination buffer strides do not match strides of the contiguously
// stored block, and it's impossible to define a TensorMap over this
// buffer. However if we are evaluating a root of an expression tree, we
// still can materialize an output into this destination, because we can
// guarantee that no one will ever access it through block API.
//
// In theory it is possible to build valid TensorStriding<TensorMap>
// expression on top of this destination buffer, however it has
// inefficient coeff/packet access, and defeats the purpose of fast block
// evaluation API.
kStrided
};
template <typename Scalar>
Scalar* data() const {
eigen_assert(m_data_type_size == sizeof(Scalar));
return static_cast<Scalar*>(m_data);
}
const Dimensions& strides() const { return m_strides; }
const DestinationBufferKind& kind() const { return m_kind; }
private:
friend class TensorBlockDescriptor;
DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
template <typename Scalar>
DestinationBuffer(Scalar* data, const Dimensions& strides,
DestinationBufferKind kind)
: m_data(static_cast<void*>(data)),
m_data_type_size(sizeof(Scalar)),
m_strides(strides),
m_kind(kind) {}
template <int Layout, typename Scalar>
static DestinationBuffer make(const TensorBlockDescriptor& desc,
Scalar* data, const Dimensions& strides) {
return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
}
template <int Layout>
static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
const Dimensions& strides) {
const Dimensions& desc_dims = desc.dimensions();
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
for (int i = 0; i < NumDims; ++i) {
if (desc_dims[i] == 1) continue;
if (desc_strides[i] != strides[i]) return kStrided;
}
return kContiguous;
}
// Storage pointer is type erased, to reduce template bloat, but we still
// keep the size of the underlying element type for error checking.
void* m_data;
size_t m_data_type_size;
    // Destination buffer dimensions always match the dimensions of the tensor
    // block descriptor it belongs to; however, the strides might be different.
Dimensions m_strides;
DestinationBufferKind m_kind;
};
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
const DestinationBuffer& destination)
: m_offset(offset),
m_dimensions(dimensions),
m_destination(destination) {}
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
: m_offset(offset),
m_dimensions(dimensions),
m_destination(DestinationBuffer()) {}
IndexType offset() const { return m_offset; }
const Dimensions& dimensions() const { return m_dimensions; }
IndexType dimension(int index) const { return m_dimensions[index]; }
IndexType size() const { return array_prod<IndexType>(m_dimensions); }
const DestinationBuffer& destination() const { return m_destination; }
template <int Layout, typename Scalar>
void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
eigen_assert(dst_base != NULL);
m_destination =
DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
}
template <int Layout, typename Scalar, typename DstStridesIndexType>
void AddDestinationBuffer(
Scalar* dst_base,
const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
// DSizes constructor will do index type promotion if it's safe.
AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
}
TensorBlockDescriptor& DropDestinationBuffer() {
m_destination.m_data = NULL;
m_destination.m_kind = DestinationBuffer::kEmpty;
return *this;
}
bool HasDestinationBuffer() const {
return m_destination.kind() != DestinationBuffer::kEmpty;
}
// Returns a copy of `*this` with updated offset.
TensorBlockDescriptor WithOffset(IndexType offset) const {
return TensorBlockDescriptor(offset, m_dimensions, m_destination);
}
private:
// Offset and dimensions are immutable after construction. Block descriptor
// can only be mutated by adding or dropping destination.
const IndexType m_offset;
const Dimensions m_dimensions;
DestinationBuffer m_destination;
};
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
TensorBlockMapper() = default;
TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
const TensorBlockResourceRequirements& requirements)
: m_tensor_dimensions(dimensions), m_requirements(requirements) {
// Compute block dimensions and the total number of blocks.
InitializeBlockDimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
return m_total_block_count;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
return m_block_dimensions.TotalSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
blockDimensions() const {
return m_block_dimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
blockDescriptor(IndexType block_index) const {
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
IndexType offset = 0;
DSizes<IndexType, NumDims> dimensions;
if (NumDims == 0) return BlockDescriptor(offset, dimensions);
// Iterate outer -> inner dimensions.
for (int i = NumDims - 1; i >= 0; --i) {
const int dim = isColMajor ? i : NumDims - i - 1;
const IndexType idx = block_index / m_block_strides[dim];
block_index -= idx * m_block_strides[dim];
const IndexType coord = idx * m_block_dimensions[dim];
dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
m_block_dimensions[dim]);
offset += coord * m_tensor_strides[dim];
}
return {offset, dimensions};
}
private:
void InitializeBlockDimensions() {
// Requested block shape and size.
const TensorBlockShapeType shape_type = m_requirements.shape_type;
IndexType target_block_size =
numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
IndexType tensor_size = m_tensor_dimensions.TotalSize();
// Corner case: one of the dimensions is zero. Logic below is too complex
// to handle this case on a general basis, just use unit block size.
// Note: we must not yield blocks with zero dimensions (recipe for
// overflows/underflows, divisions by zero and NaNs later).
if (tensor_size == 0) {
for (int i = 0; i < NumDims; ++i) {
m_block_dimensions[i] = 1;
}
m_total_block_count = 0;
return;
}
// If tensor fits into a target block size, evaluate it as a single block.
if (tensor_size <= target_block_size) {
m_block_dimensions = m_tensor_dimensions;
m_total_block_count = 1;
// The only valid block index is `0`, and in this case we do not need
// to compute real strides for tensor or blocks (see blockDescriptor).
for (int i = 0; i < NumDims; ++i) {
m_tensor_strides[i] = 0;
m_block_strides[i] = 1;
}
return;
}
static const bool isColMajor = Layout == static_cast<int>(ColMajor);
// Block shape skewed towards inner dimension.
if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
IndexType coeff_to_allocate = target_block_size;
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
m_block_dimensions[dim] =
numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
coeff_to_allocate = divup(
coeff_to_allocate,
numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
// Tensor will not fit within 'target_block_size' budget: calculate tensor
// block dimension sizes based on "square" dimension size target.
const IndexType dim_size_target = convert_index<IndexType>(
std::pow(static_cast<float>(target_block_size),
1.0f / static_cast<float>(m_block_dimensions.rank())));
for (int i = 0; i < NumDims; ++i) {
// TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing
// 'block_dim_size' in this manner can increase the number of
// blocks, and so will amplify any per-block overhead.
m_block_dimensions[i] =
numext::mini(dim_size_target, m_tensor_dimensions[i]);
}
// Add any un-allocated coefficients to inner dimension(s).
IndexType total_size = m_block_dimensions.TotalSize();
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - i - 1;
if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
const IndexType total_size_other_dims =
total_size / m_block_dimensions[dim];
const IndexType alloc_avail =
divup<IndexType>(target_block_size, total_size_other_dims);
if (alloc_avail == m_block_dimensions[dim]) {
// Insufficient excess coefficients to allocate.
break;
}
m_block_dimensions[dim] =
numext::mini(m_tensor_dimensions[dim], alloc_avail);
total_size = total_size_other_dims * m_block_dimensions[dim];
}
}
} else {
eigen_assert(false); // unknown block shape
}
eigen_assert(m_block_dimensions.TotalSize() >=
numext::mini<IndexType>(target_block_size,
m_tensor_dimensions.TotalSize()));
// Calculate block counts by dimension and total block count.
DSizes<IndexType, NumDims> block_count;
for (int i = 0; i < NumDims; ++i) {
block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
}
m_total_block_count = array_prod(block_count);
// Calculate block strides (used for enumerating blocks).
m_tensor_strides = strides<Layout>(m_tensor_dimensions);
m_block_strides = strides<Layout>(block_count);
}
DSizes<IndexType, NumDims> m_tensor_dimensions;
TensorBlockResourceRequirements m_requirements;
DSizes<IndexType, NumDims> m_block_dimensions;
IndexType m_total_block_count;
DSizes<IndexType, NumDims> m_tensor_strides;
DSizes<IndexType, NumDims> m_block_strides;
};
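// For illustration: a hypothetical ColMajor 10x10 tensor mapped with a target
// block size of 25 coefficients and kUniformAllDims yields 5x5 block
// dimensions and blockCount() == 4; blockDescriptor(3) then describes the
// bottom-right quadrant, i.e. offset 55 (element (5, 5)) with dimensions
// [5, 5].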
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size.
// Scratch allocator keeps a trace of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.
template <typename Device>
class TensorBlockScratchAllocator {
public:
explicit TensorBlockScratchAllocator(const Device& device)
: m_device(device), m_allocation_index(0) {}
~TensorBlockScratchAllocator() {
for (size_t i = 0; i < m_allocations.size(); ++i) {
m_device.deallocate(m_allocations[i].ptr);
}
}
void* allocate(size_t size) {
// TODO(ezhulenev): Remove when replaced with inlined vector.
if (m_allocations.capacity() == 0) m_allocations.reserve(8);
    // Check if we already have an existing allocation at the current index.
const int num_allocations = static_cast<int>(m_allocations.size());
const bool has_allocation = m_allocation_index < num_allocations;
// Allocation index can't be larger than the number of allocations.
eigen_assert(m_allocation_index <= num_allocations);
    // If we have an existing allocation and its size is larger than or equal
    // to the requested size, we do nothing.
    // If the current allocation can't fit the requested size, we deallocate it
    // and replace it with a larger allocation.
if (has_allocation && m_allocations[m_allocation_index].size < size) {
m_device.deallocate(m_allocations[m_allocation_index].ptr);
m_allocations[m_allocation_index].ptr = m_device.allocate(size);
m_allocations[m_allocation_index].size = size;
}
    // Make a new allocation if we don't have an existing one.
if (!has_allocation) {
Allocation allocation;
allocation.ptr = m_device.allocate(size);
allocation.size = size;
m_allocations.push_back(allocation);
}
eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
eigen_assert(m_allocations[m_allocation_index].size >= size);
return m_allocations[m_allocation_index++].ptr;
}
void reset() { m_allocation_index = 0; }
private:
struct Allocation {
void* ptr;
size_t size;
};
const Device& m_device;
int m_allocation_index;
// TODO(ezhulenev): This should be an inlined vector.
std::vector<Allocation> m_allocations;
};
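// A hedged usage sketch (assuming a Device type `d` providing
// allocate/deallocate):
//   TensorBlockScratchAllocator<Device> scratch(d);
//   void* a = scratch.allocate(512);  // first block: fresh allocation
//   void* b = scratch.allocate(256);  // second temporary for the same block
//   scratch.reset();                  // next block evaluation starts over
//   void* c = scratch.allocate(512);  // reuses the buffer behind `a`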
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
enum TensorBlockKind {
// Tensor block that is a lazy expression that must be assigned to a
// destination using TensorBlockAssign.
kExpr,
// Tensor block that is a view into a memory buffer owned by an underlying
// Tensor expression (e.g. it can be a view into a Tensor buffer).
kView,
// Tensor block that was materialized in a scratch memory buffer, allocated
// with TensorBlockScratchAllocator. This block must be copied to a
// destination, similar to a block of `kExpr` type.
kMaterializedInScratch,
// Tensor block that was materialized directly into the final output memory
// buffer. For example if the left side of an assignment is a Tensor, we can
// directly materialize the block in the destination memory.
//
// If strides in the output buffer do not match tensor block strides, the
// Tensor expression will be invalid, and should not be used by
// TensorBlockAssign or for constructing another block expression.
kMaterializedInOutput
};
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef in
// TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
public:
typedef void XprType;
};
// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block expression
// for argument types that do not support block evaluation.
template <typename XprType>
struct XprScalar {
typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer; it's either a memory
// buffer that backs the original expression (e.g. block is just a view into a
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.
template <typename Scalar, int NumDims, int Layout,
typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
const Dimensions& dimensions, bool valid_expr = true)
: m_kind(kind),
m_data(data),
m_dimensions(dimensions),
m_expr(m_data, m_dimensions),
m_valid_expr(valid_expr) {
eigen_assert(m_kind == internal::TensorBlockKind::kView ||
m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
m_kind == internal::TensorBlockKind::kMaterializedInOutput);
}
TensorBlockKind kind() const { return m_kind; }
// NOTE(ezhulenev): Returning XprType by value like in other block types
// causes asan failures. The theory is that XprType::Nested doesn't work
// properly for TensorMap.
const XprType& expr() const {
eigen_assert(m_valid_expr);
return m_expr;
}
const Scalar* data() const { return m_data; }
void cleanup() {}
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
// TensorMaterializedBlock can be backed by different types of storage:
//
// (1) Contiguous block of memory allocated with scratch allocator.
// (2) Contiguous block of memory reused from tensor block descriptor
// destination buffer.
// (3) Strided block of memory reused from tensor block descriptor
// destination buffer.
//
class Storage {
public:
Scalar* data() const { return m_data; }
const Dimensions& dimensions() const { return m_dimensions; }
const Dimensions& strides() const { return m_strides; }
TensorMaterializedBlock AsTensorMaterializedBlock() const {
return TensorMaterializedBlock(
m_materialized_in_output
? internal::TensorBlockKind::kMaterializedInOutput
: internal::TensorBlockKind::kMaterializedInScratch,
m_data, m_dimensions, !m_strided_storage);
}
private:
friend class TensorMaterializedBlock;
Storage(Scalar* data, const Dimensions& dimensions,
const Dimensions& strides, bool materialized_in_output,
bool strided_storage)
: m_data(data),
m_dimensions(dimensions),
m_strides(strides),
m_materialized_in_output(materialized_in_output),
m_strided_storage(strided_storage) {}
Scalar* m_data;
Dimensions m_dimensions;
Dimensions m_strides;
bool m_materialized_in_output;
bool m_strided_storage;
};
  // Creates storage for a materialized block, either from the block descriptor
  // destination buffer or by allocating a new buffer with the scratch allocator.
template <typename TensorBlockScratch>
EIGEN_STRONG_INLINE static Storage prepareStorage(
TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool allow_strided_storage = false) {
// Try to reuse destination as an output block buffer.
typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
if (desc.destination().kind() == DestinationBuffer::kContiguous) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/true,
/*strided_storage=*/false);
} else if (desc.destination().kind() == DestinationBuffer::kStrided &&
allow_strided_storage) {
Scalar* buffer = desc.destination().template data<Scalar>();
desc.DropDestinationBuffer();
return Storage(buffer, desc.dimensions(), desc.destination().strides(),
/*materialized_in_output=*/true, /*strided_storage=*/true);
} else {
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
/*materialized_in_output=*/false,
/*strided_storage=*/false);
}
}
// Creates a materialized block for the given descriptor from a memory buffer.
template <typename DataDimensions, typename TensorBlockScratch>
EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
const Scalar* data, const DataDimensions& data_dims,
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation and construct a
    // block from the existing `data` memory buffer.
//
// Example: (RowMajor layout)
// data_dims: [11, 12, 13, 14]
// desc.dimensions(): [1, 1, 3, 14]
//
// In this case we can construct a TensorBlock starting at
// `data + desc.offset()`, with a `desc.dimensions()` block sizes.
static const bool is_col_major = Layout == ColMajor;
// Find out how many inner dimensions have a matching size.
int num_matching_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (data_dims[dim] != desc.dimensions()[dim]) break;
++num_matching_inner_dims;
}
// All the outer dimensions must be of size `1`, except a single dimension
// before the matching inner dimension (`3` in the example above).
bool can_use_direct_access = true;
for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
int dim = is_col_major ? i : NumDims - i - 1;
if (desc.dimension(dim) != 1) {
can_use_direct_access = false;
break;
}
}
if (can_use_direct_access) {
const Scalar* block_start = data + desc.offset();
return TensorMaterializedBlock(internal::TensorBlockKind::kView,
block_start, desc.dimensions());
} else {
// Reuse destination buffer or allocate new buffer with scratch allocator.
const Storage storage = prepareStorage(desc, scratch);
typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
data, desc.offset());
TensorBlockIODst dst(storage.dimensions(), storage.strides(),
storage.data());
TensorBlockIO::Copy(dst, src);
return storage.AsTensorMaterializedBlock();
}
}
private:
TensorBlockKind m_kind;
const Scalar* m_data;
Dimensions m_dimensions;
XprType m_expr;
bool m_valid_expr;
};
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
// functor to the blocks produced by the underlying Tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
static const bool NoArgBlockAccess =
internal::is_void<typename ArgTensorBlock::XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
: m_arg_block(arg_block), m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
// BinaryOp functor to the blocks produced by the two underlying Tensor
// expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
static const bool NoArgBlockAccess =
internal::is_void<typename LhsTensorBlock::XprType>::value ||
internal::is_void<typename RhsTensorBlock::XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
const typename RhsTensorBlock::XprType> >::type
XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
const RhsTensorBlock& right_block,
const BinaryOp& functor)
: m_left_block(left_block),
m_right_block(right_block),
m_functor(functor) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const {
return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
}
const Scalar* data() const { return NULL; }
void cleanup() {
m_left_block.cleanup();
m_right_block.cleanup();
}
private:
LhsTensorBlock m_left_block;
RhsTensorBlock m_right_block;
BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is a
// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
typedef typename ArgTensorBlock::XprType ArgXprType;
static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
const BlockFactory& factory)
: m_arg_block(arg_block), m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
const Scalar* data() const { return NULL; }
void cleanup() { m_arg_block.cleanup(); }
private:
ArgTensorBlock m_arg_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock,
typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
typedef typename Arg1TensorBlock::XprType Arg1XprType;
typedef typename Arg2TensorBlock::XprType Arg2XprType;
typedef typename Arg3TensorBlock::XprType Arg3XprType;
static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
internal::is_void<Arg2XprType>::value ||
internal::is_void<Arg3XprType>::value;
public:
typedef typename conditional<
NoArgBlockAccess, void,
typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
Arg3XprType>::type>::type XprType;
typedef typename XprScalar<XprType>::type Scalar;
TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
const Arg2TensorBlock& arg2_block,
const Arg3TensorBlock& arg3_block,
const BlockFactory& factory)
: m_arg1_block(arg1_block),
m_arg2_block(arg2_block),
m_arg3_block(arg3_block),
m_factory(factory) {}
TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
XprType expr() const {
return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
m_arg3_block.expr());
}
const Scalar* data() const { return NULL; }
void cleanup() {
m_arg1_block.cleanup();
m_arg2_block.cleanup();
m_arg3_block.cleanup();
}
private:
Arg1TensorBlock m_arg1_block;
Arg2TensorBlock m_arg2_block;
Arg3TensorBlock m_arg3_block;
BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
typedef typename packet_traits<Scalar>::type Packet;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size
};
public:
// Specifying the linear copy kind statically gives a ~30% speedup for small sizes.
enum class Kind {
Linear = 0, // src_stride == 1 && dst_stride == 1
Scatter = 1, // src_stride == 1 && dst_stride != 1
FillLinear = 2, // src_stride == 0 && dst_stride == 1
FillScatter = 3, // src_stride == 0 && dst_stride != 1
Gather = 4, // src_stride not in {0, 1} && dst_stride == 1
Random = 5 // everything else
};
struct Dst {
Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
Scalar* data;
};
struct Src {
Src(IndexType o, IndexType s, const Scalar* d)
: offset(o), stride(s), data(d) {}
IndexType offset;
IndexType stride;
const Scalar* data;
};
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
const Src& src,
const size_t count) {
Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
src.data);
}
private:
template <typename StridedLinearBufferCopy::Kind kind>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const IndexType count, const IndexType dst_offset,
const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
const IndexType src_offset, const IndexType src_stride,
const Scalar* EIGEN_RESTRICT src_data) {
const Scalar* src = &src_data[src_offset];
Scalar* dst = &dst_data[dst_offset];
if (!Vectorizable) {
for (Index i = 0; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
return;
}
const IndexType vectorized_size = count - PacketSize;
IndexType i = 0;
if (kind == StridedLinearBufferCopy::Kind::Linear) {
// ******************************************************************** //
// Linear copy from `src` to `dst`.
const IndexType unrolled_size = count - 4 * PacketSize;
eigen_assert(src_stride == 1 && dst_stride == 1);
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
Packet p = ploadu<Packet>(src + i + j * PacketSize);
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
// Scatter from `src` to `dst`.
eigen_assert(src_stride == 1 && dst_stride != 1);
for (; i <= vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (; i < count; ++i) {
dst[i * dst_stride] = src[i];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
// Fill `dst` with value at `*src`.
eigen_assert(src_stride == 0 && dst_stride == 1);
const IndexType unrolled_size = count - 4 * PacketSize;
Packet p = pload1<Packet>(src);
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = *src;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
// Scatter `*src` into `dst`.
eigen_assert(src_stride == 0 && dst_stride != 1);
Packet p = pload1<Packet>(src);
for (; i <= vectorized_size; i += PacketSize) {
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (; i < count; ++i) {
dst[i * dst_stride] = *src;
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Gather) {
// Gather from `src` into `dst`.
eigen_assert(dst_stride == 1);
for (; i <= vectorized_size; i += PacketSize) {
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (; i < count; ++i) {
dst[i] = src[i * src_stride];
}
// ******************************************************************** //
} else if (kind == StridedLinearBufferCopy::Kind::Random) {
// Random.
for (; i < count; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
} else {
eigen_assert(false);
}
}
};
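// Usage sketch (illustrative, not part of the original header): picking a
// copy `Kind` from the (src_stride, dst_stride) pair and invoking the public
// `Run` overload. The function name, buffers and sizes are hypothetical.
inline void ExampleStridedLinearCopy() {
  typedef StridedLinearBufferCopy<float, Index> LinCopy;
  float src_buf[64];
  float dst_buf[64];
  for (int i = 0; i < 64; ++i) src_buf[i] = static_cast<float>(i);
  // src_stride == 1 && dst_stride == 1 selects Kind::Linear, the unrolled
  // and vectorized path.
  LinCopy::Run<LinCopy::Kind::Linear>(
      LinCopy::Dst(/*offset=*/0, /*stride=*/1, dst_buf),
      LinCopy::Src(/*offset=*/0, /*stride=*/1, src_buf),
      /*count=*/64);
  // src_stride == 1 && dst_stride != 1 would select Kind::Scatter, and a
  // src_stride of 0 (broadcasting one value) selects one of the Fill kinds.
}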
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for `src` we only need the strides to navigate through the source
// memory buffer.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
static const bool IsColMajor = (Layout == ColMajor);
typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
public:
typedef DSizes<IndexType, NumDims> Dimensions;
typedef DSizes<int, NumDims> DimensionsMap;
struct Dst {
Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
IndexType dst_offset = 0)
: dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
struct Src {
Src(const Dimensions& src_strides, const Scalar* src,
IndexType src_offset = 0)
: strides(src_strides), data(src), offset(src_offset) {}
Dimensions strides;
const Scalar* data;
IndexType offset;
};
// Copies data to `dst` from `src`, using provided dimensions mapping:
//
// src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
//
// Returns the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
// Copy single scalar value from `src` to `dst`.
if (NumDims == 0) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Both `dst` and `src` must have a contiguous innermost dimension. We also
// accept the special case of stride `0`, because it's used as a trick to
// implement broadcasting.
{
int inner_dim = IsColMajor ? 0 : NumDims - 1;
EIGEN_UNUSED_VARIABLE(inner_dim);
eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
}
// Give a shorter name to `dst_to_src_dim_map`.
const DimensionsMap& dim_map = dst_to_src_dim_map;
// Do not squeeze reordered inner dimensions.
int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
// NOTE: We find the innermost dimension (contiguous in memory) in the dst
// block, and we write data linearly into that dimension, reading it from
// the src. If dimensions are reordered, we might end up reading data from
// the src with `stride != 1`.
//
// NOTE: Random-Read/Linear-Write can be up to ~2X faster than
// Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
// Find the innermost dimension in the dst whose size is not 1. This is the
// effective inner dim.
int num_size_one_inner_dims = 0;
for (int i = 0; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
if (dst.dims[dst_dim] != 1) break;
num_size_one_inner_dims++;
}
// If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
if (num_size_one_inner_dims == NumDims) {
*(dst.data + dst.offset) = *(src.data + src.offset);
return 1;
}
// Outermost dimension in the dst with `stride == 1` (contiguous in memory).
const int dst_stride1_dim = IsColMajor
? num_size_one_inner_dims
: NumDims - num_size_one_inner_dims - 1;
// Dimension in the src that corresponds to the dst innermost dimension.
const int src_dim_for_dst_stride1_dim =
NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
// Size of the innermost dimension (length of contiguous blocks of memory).
IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
// Squeeze multiple inner dims into one if they are contiguous in `dst` and
// `src` memory, so we can do fewer linear copy calls.
for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
const int dst_dim = IsColMajor ? i : NumDims - i - 1;
const IndexType dst_stride = dst.strides[dst_dim];
const IndexType src_stride = src.strides[dim_map[dst_dim]];
if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
dst_inner_dim_size *= dst.dims[dst_dim];
++num_size_one_inner_dims;
} else {
break;
}
}
// Setup strides to read data from `src` and write to `dst`.
IndexType input_offset = src.offset;
IndexType output_offset = dst.offset;
IndexType input_stride =
NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> it;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int idx = 0; // currently initialized iterator state index
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
if (dst.dims[dst_dim] == 1) continue;
it[idx].size = dst.dims[dst_dim];
it[idx].input_stride = src.strides[dim_map[dst_dim]];
it[idx].output_stride = dst.strides[dst_dim];
it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// Iterate copying data from src to dst.
const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
#define COPY_INNER_DIM(KIND) \
IndexType num_copied = 0; \
for (num_copied = 0; num_copied < block_total_size; \
num_copied += dst_inner_dim_size) { \
LinCopy::template Run<KIND>( \
typename LinCopy::Dst(output_offset, output_stride, dst.data), \
typename LinCopy::Src(input_offset, input_stride, src.data), \
dst_inner_dim_size); \
\
for (int j = 0; j < idx; ++j) { \
if (++it[j].count < it[j].size) { \
input_offset += it[j].input_stride; \
output_offset += it[j].output_stride; \
break; \
} \
it[j].count = 0; \
input_offset -= it[j].input_span; \
output_offset -= it[j].output_span; \
} \
} \
return num_copied;
if (input_stride == 1 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Linear);
} else if (input_stride == 1 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::Scatter);
} else if (input_stride == 0 && output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::FillLinear);
} else if (input_stride == 0 && output_stride != 1) {
COPY_INNER_DIM(LinCopy::Kind::FillScatter);
} else if (output_stride == 1) {
COPY_INNER_DIM(LinCopy::Kind::Gather);
} else {
COPY_INNER_DIM(LinCopy::Kind::Random);
}
#undef COPY_INNER_DIM
}
// Copy from `src` to `dst` with an identity src->dst dimension map. Returns
// the number of copied elements.
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
const Src& src) {
DimensionsMap dst_to_src_map;
for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
return Copy(dst, src, dst_to_src_map);
}
private:
struct BlockIteratorState {
BlockIteratorState()
: size(0),
count(0),
input_stride(0),
output_stride(0),
input_span(0),
output_span(0) {}
IndexType size;
IndexType count;
IndexType input_stride;
IndexType output_stride;
IndexType input_span;
IndexType output_span;
};
// Compute how many inner dimensions we are allowed to squeeze when doing IO
// between two tensor blocks. It's safe to squeeze inner dimensions only if
// they are not reordered.
static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
int num_squeezable_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
if (dim_map[dim] != dim) break;
num_squeezable_dims++;
}
return num_squeezable_dims;
}
};
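// Usage sketch (illustrative, not part of the original header): copying a
// 2x3 RowMajor block between two contiguous buffers with the identity
// dimension map. The function and buffer names are hypothetical.
inline void ExampleTensorBlockIO() {
  typedef TensorBlockIO<float, Index, /*NumDims=*/2, RowMajor> BlockIO;
  float src_buf[6] = {0, 1, 2, 3, 4, 5};
  float dst_buf[6] = {0, 0, 0, 0, 0, 0};
  BlockIO::Dimensions dims(2, 3);
  // RowMajor strides for a [2, 3] tensor are {3, 1}: the inner dimension is
  // contiguous (stride 1) for both `dst` and `src`, as required by `Copy`.
  BlockIO::Dimensions strides(3, 1);
  BlockIO::Dst dst(dims, strides, dst_buf);
  BlockIO::Src src(strides, src_buf);
  const Index copied = BlockIO::Copy(dst, src);  // copies all 6 elements
  EIGEN_UNUSED_VARIABLE(copied);
}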
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see definition above).
//
// Also, currently the innermost dimension of `target` must have a stride `1`
// (contiguous in memory). This restriction could be lifted with `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
typename IndexType = Eigen::Index>
class TensorBlockAssignment {
// We will use coeff/packet path to evaluate block expressions.
typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
TensorBlockEvaluator;
typedef DSizes<IndexType, NumDims> Dimensions;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size
};
template <bool Vectorizable, typename Evaluator>
struct InnerDimAssign {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
for (IndexType i = 0; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
template <typename Evaluator>
struct InnerDimAssign<true, Evaluator> {
EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
typedef typename packet_traits<Scalar>::type Packet;
const IndexType unrolled_size = count - 4 * PacketSize;
const IndexType vectorized_size = count - PacketSize;
IndexType i = 0;
for (; i <= unrolled_size; i += 4 * PacketSize) {
for (int j = 0; j < 4; ++j) {
const IndexType idx = eval_offset + i + j * PacketSize;
Packet p = eval.template packet<Unaligned>(idx);
pstoreu<Scalar>(target + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
Packet p = eval.template packet<Unaligned>(eval_offset + i);
pstoreu<Scalar>(target + i, p);
}
for (; i < count; ++i) {
target[i] = eval.coeff(eval_offset + i);
}
}
};
public:
struct Target {
Target(const Dimensions& target_dims, const Dimensions& target_strides,
Scalar* target_data, IndexType target_offset = 0)
: dims(target_dims),
strides(target_strides),
data(target_data),
offset(target_offset) {}
Dimensions dims;
Dimensions strides;
Scalar* data;
IndexType offset;
};
static Target target(const Dimensions& target_dims,
const Dimensions& target_strides, Scalar* target_data,
IndexType target_offset = 0) {
return Target(target_dims, target_strides, target_data, target_offset);
}
template <typename TargetDimsIndexType, typename TargetStridesIndexType>
static Target target(
const DSizes<TargetDimsIndexType, NumDims>& target_dims,
const DSizes<TargetStridesIndexType, NumDims>& target_strides,
Scalar* target_data, IndexType target_offset = 0) {
// DSizes constructor will do index type promotion if it's safe.
return Target(Dimensions(target_dims), Dimensions(target_strides),
target_data, target_offset);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Target& target, const TensorBlockExpr& expr) {
// Prepare evaluator for block expression.
DefaultDevice default_device;
TensorBlockEvaluator eval(expr, default_device);
// Tensor block expression dimensions should match the destination dimensions.
eigen_assert(dimensions_match(target.dims, eval.dimensions()));
static const int Layout = TensorBlockEvaluator::Layout;
static const bool is_col_major = Layout == ColMajor;
// Initialize the output inner dimension size based on the layout.
const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
IndexType output_inner_dim_size = target.dims[inner_dim_idx];
// Target inner dimension stride must be '1'.
eigen_assert(target.strides[inner_dim_idx] == 1);
// Squeeze multiple inner dims into one if they are contiguous in `target`.
IndexType num_squeezed_dims = 0;
for (Index i = 1; i < NumDims; ++i) {
const Index dim = is_col_major ? i : NumDims - i - 1;
const IndexType target_stride = target.strides[dim];
if (output_inner_dim_size == target_stride) {
output_inner_dim_size *= target.dims[dim];
num_squeezed_dims++;
} else {
break;
}
}
// Initialize output block iterator state. Dimensions in this array are
// always in inner-most -> outer-most order (column-major layout).
array<BlockIteratorState, NumDims> it;
int idx = 0; // currently initialized iterator state index
for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
it[idx].count = 0;
it[idx].size = target.dims[dim];
it[idx].output_stride = target.strides[dim];
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// We read the block expression from the beginning, and start writing data to
// `target` at the given offset.
IndexType input_offset = 0;
IndexType output_offset = target.offset;
// Iterate copying data from `eval` to `target`.
for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
// Assign to `target` at current offset.
InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
TensorBlockEvaluator>::Run(target.data + output_offset,
output_inner_dim_size, eval,
input_offset);
// Move input offset forward by the number of assigned coefficients.
input_offset += output_inner_dim_size;
// Update index.
for (int j = 0; j < idx; ++j) {
if (++it[j].count < it[j].size) {
output_offset += it[j].output_stride;
break;
}
it[j].count = 0;
output_offset -= it[j].output_span;
}
}
}
private:
struct BlockIteratorState {
BlockIteratorState()
: count(0), size(0), output_stride(0), output_span(0) {}
IndexType count;
IndexType size;
IndexType output_stride;
IndexType output_span;
};
};
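// Usage sketch (illustrative, not part of the original header): evaluating a
// block expression directly into a destination buffer. The function name,
// buffers and shapes are hypothetical; the expression type is deduced from
// the TensorMaps.
inline void ExampleTensorBlockAssignment() {
  float a_buf[6] = {0, 1, 2, 3, 4, 5};
  float b_buf[6] = {5, 4, 3, 2, 1, 0};
  float out_buf[6] = {0, 0, 0, 0, 0, 0};
  TensorMap<Tensor<float, 2, RowMajor> > a(a_buf, 2, 3);
  TensorMap<Tensor<float, 2, RowMajor> > b(b_buf, 2, 3);
  const auto expr = a + b;  // lazy TensorCwiseBinaryOp, evaluated by Run below
  typedef TensorBlockAssignment<float, 2, decltype(expr)> BlockAssign;
  DSizes<Index, 2> dims(2, 3);
  DSizes<Index, 2> strides(3, 1);  // RowMajor: the inner stride must be 1
  BlockAssign::Run(BlockAssign::target(dims, strides, out_buf), expr);
  // out_buf now holds {5, 5, 5, 5, 5, 5}.
}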
// -------------------------------------------------------------------------- //
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
......@@ -1971,9 +1971,9 @@ class TestPow_factor_tensor(TestActivation):
feed={"x": input},
fetch_list=[out_1, out_2, res, out_6])
assert np.array_equal(res_1, np.power(input, 2))
assert np.array_equal(res_2, np.power(input, 3))
assert np.array_equal(res_6, np.power(input, 3))
assert np.allclose(res_1, np.power(input, 2))
assert np.allclose(res_2, np.power(input, 3))
assert np.allclose(res_6, np.power(input, 3))
def test_error(self):
in1 = fluid.layers.data(
......