From b89dd86fb65fc9095219aa4d37ad52728b8f689a Mon Sep 17 00:00:00 2001
From: Zhang Ting <709968123@qq.com>
Date: Sat, 18 Apr 2020 13:51:25 +0800
Subject: [PATCH] Update eigen (#23203)

* update eigen, test=develop

* remove patches, test=develop

* add definition of -fabi-version, test=develop

* add patch for TensorBlock.h, test=develop

* test windows, test=develop

* only update eigen for Linux, test=develop

* add code comments, test=develop
---
 cmake/configure.cmake                   |   4 +
 cmake/external/eigen.cmake              |  21 ++-
 paddle/fluid/platform/device_context.cc |  11 ++
 patches/eigen/Geometry_SSE.h            | 189 ++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 2 deletions(-)
 create mode 100644 patches/eigen/Geometry_SSE.h

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 14a8bede087..b0ce1a4ea2d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -69,6 +69,10 @@ endif()
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
+    # std::max and std::min are constexpr only since C++14, but Eigen uses them in
+    # constant expressions, which fails to compile under C++11. Capping
+    # EIGEN_MAX_CPP_VER at 11 makes Eigen set EIGEN_HAS_CONSTEXPR=0 and avoids the error.
+    add_definitions(-DEIGEN_MAX_CPP_VER=11)
 
     FIND_PACKAGE(CUDA REQUIRED)
 
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6711442283f..d7f89e4c901 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -14,10 +14,17 @@
 
 include(ExternalProject)
 
+# update eigen to the commit id 4da2c6b1 on 03/19/2020
 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
 set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
-set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror.git)
-set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
+set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
+set(EIGEN_TAG        4da2c6b1974827b1999bab652a3d4703e1992d26)
+
+# The recent version of Eigen fails to compile on Windows, so Windows keeps the previous repository and commit.
+if(WIN32)
+    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror.git)
+    set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
+endif()
 
 # eigen on cuda9.1 missing header of math_funtions.hpp
 # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
@@ -35,6 +42,16 @@ if(WIN32)
     file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
     file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
     set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y)
+elseif(LINUX)
+    # With g++ 4.8, __GXX_ABI_VERSION is less than 1004,
+    # which causes a compilation error in Geometry_SSE.h:38:
+    # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)'"
+    # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
+    # Adding -fabi-version=4 avoids that error, but causes "double free corruption" when compiling with gcc8,
+    # so overwrite the header with a patched copy instead (via the portable `cmake -E copy`, not bare `cp`).
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst)
+    set(EIGEN_PATCH_COMMAND ${CMAKE_COMMAND} -E copy ${native_src} ${native_dst})
 endif()
 
 set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 8a655254917..a94714ef82a 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -176,15 +176,26 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 
   void* scratchpad() const override {
     if (scratch_ == NULL) {
+// Windows still builds against an older Eigen that names this constant
+// kCudaScratchSize; the Eigen used on other platforms names it kGpuScratchSize.
+// Once Windows moves to the newer Eigen, use kGpuScratchSize unconditionally.
+#ifdef _WIN32
       scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+#else
+      scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
+#endif
     }
     return scratch_;
   }
 
   unsigned int* semaphore() const override {
     if (semaphore_ == NULL) {
+#ifdef _WIN32
       char* scratch =
           static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+#else
+      char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
+#endif
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
diff --git a/patches/eigen/Geometry_SSE.h b/patches/eigen/Geometry_SSE.h
new file mode 100644
index 00000000000..f45d5eb8a01
--- /dev/null
+++ b/patches/eigen/Geometry_SSE.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_SSE_H
+#define EIGEN_GEOMETRY_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float> {
+  enum {
+    AAlignment = traits<Derived>::Alignment,
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<float>>::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a,
+                                      const QuaternionBase<OtherDerived>& _b) {
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+    Quaternion<float> res;
+    const __m128 mask = _mm_setr_ps(0.f, 0.f, 0.f, -0.f);  // lane-3 sign mask
+    __m128 a = ae.template packet<AAlignment, __m128>(0);
+    __m128 b = be.template packet<BAlignment, __m128>(0);
+    __m128 s1 =
+        pmul(vec4f_swizzle1(a, 1, 2, 0, 2), vec4f_swizzle1(b, 2, 0, 1, 2));
+    __m128 s2 =
+        pmul(vec4f_swizzle1(a, 3, 3, 3, 1), vec4f_swizzle1(b, 0, 1, 2, 1));
+    pstoret<float, __m128, ResAlignment>(
+        &res.x(),
+        padd(psub(pmul(a, vec4f_swizzle1(b, 3, 3, 3, 3)),
+                  pmul(vec4f_swizzle1(a, 2, 0, 1, 0),
+                       vec4f_swizzle1(b, 1, 2, 0, 0))),
+             pxor(mask, padd(s1, s2))));  // (s1 + s2) with lane-3 sign flipped
+
+    return res;
+  }
+};
+
+template <class Derived>
+struct quat_conj<Architecture::SSE, Derived, float> {
+  enum { ResAlignment = traits<Quaternion<float>>::Alignment };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q) {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<float> res;
+    const Packet4f mask = _mm_setr_ps(-0.f, -0.f, -0.f, 0.f);
+    pstoret<float, Packet4f, ResAlignment>(
+        &res.x(),
+        pxor(mask,  // flips the sign of lanes 0-2; lane 3 is left unchanged
+             qe.template packet<traits<Derived>::Alignment, Packet4f>(0)));
+    return res;
+  }
+};
+
+template <typename VectorLhs, typename VectorRhs>
+struct cross3_impl<Architecture::SSE, VectorLhs, VectorRhs, float, true> {
+  enum {
+    ResAlignment =
+        traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
+  };
+  static inline typename plain_matrix_type<VectorLhs>::type run(
+      const VectorLhs& lhs, const VectorRhs& rhs) {
+    evaluator<VectorLhs> lhs_eval(lhs);
+    evaluator<VectorRhs> rhs_eval(rhs);
+    __m128 a =
+        lhs_eval.template packet<traits<VectorLhs>::Alignment, __m128>(0);
+    __m128 b =
+        rhs_eval.template packet<traits<VectorRhs>::Alignment, __m128>(0);
+    __m128 mul1 =
+        pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3));
+    __m128 mul2 =
+        pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3));
+    typename plain_matrix_type<VectorLhs>::type res;
+    pstoret<float, __m128, ResAlignment>(&res.x(), psub(mul1, mul2));  // a x b
+    return res;
+  }
+};
+
+template <class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double> {
+  enum {
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<double>>::Alignment
+  };
+
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a,
+                                       const QuaternionBase<OtherDerived>& _b) {
+    const Packet2d mask =
+        _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
+
+    Quaternion<double> res;
+
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());  // unused below
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+
+    const double* a = _a.coeffs().data();  // raw coeffs, read as a[0..3]
+    Packet2d b_xy = be.template packet<BAlignment, Packet2d>(0);
+    Packet2d b_zw = be.template packet<BAlignment, Packet2d>(2);
+    Packet2d a_xx = pset1<Packet2d>(a[0]);
+    Packet2d a_yy = pset1<Packet2d>(a[1]);
+    Packet2d a_zz = pset1<Packet2d>(a[2]);
+    Packet2d a_ww = pset1<Packet2d>(a[3]);
+
+    // two temporaries, reused for the xy half and then the zw half:
+    Packet2d t1, t2;
+
+    /*
+     * t1 = ww*xy + yy*zw
+     * t2 = zz*xy - xx*zw
+     * res.xy = t1 +/- swap(t2)
+     */
+    t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
+    t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
+#ifdef EIGEN_VECTORIZE_SSE3
+    EIGEN_UNUSED_VARIABLE(mask)
+    pstoret<double, Packet2d, ResAlignment>(&res.x(),
+                                            _mm_addsub_pd(t1, preverse(t2)));
+#else
+    pstoret<double, Packet2d, ResAlignment>(&res.x(),
+                                            padd(t1, pxor(mask, preverse(t2))));
+#endif
+
+    /*
+     * t1 = ww*zw - yy*xy
+     * t2 = zz*zw + xx*xy
+     * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
+     */
+    t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
+    t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
+#ifdef EIGEN_VECTORIZE_SSE3
+    EIGEN_UNUSED_VARIABLE(mask)
+    pstoret<double, Packet2d, ResAlignment>(
+        &res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
+#else
+    pstoret<double, Packet2d, ResAlignment>(&res.z(),
+                                            psub(t1, pxor(mask, preverse(t2))));
+#endif
+
+    return res;
+  }
+};
+
+template <class Derived>
+struct quat_conj<Architecture::SSE, Derived, double> {
+  enum { ResAlignment = traits<Quaternion<double>>::Alignment };
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q) {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<double> res;
+    const Packet2d mask0 = _mm_setr_pd(-0., -0.);  // sign bit in both lanes
+    const Packet2d mask2 = _mm_setr_pd(-0., 0.);  // sign bit in lane 0 only
+    pstoret<double, Packet2d, ResAlignment>(
+        &res.x(),
+        pxor(mask0,
+             qe.template packet<traits<Derived>::Alignment, Packet2d>(0)));
+    pstoret<double, Packet2d, ResAlignment>(
+        &res.z(),
+        pxor(mask2,
+             qe.template packet<traits<Derived>::Alignment, Packet2d>(2)));
+    return res;
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GEOMETRY_SSE_H
-- 
GitLab