[cherry-pick]CPU forward calculation replaces Eigen with Lapack (#35916) (#36091)

cherry-pick #35916，CPU前向计算将Eigen替换为Lapack，修改linalg暴露规则

[cherry-pick]CPU forward calculation replaces Eigen with Lapack (#35916) (#36091)
cherry-pick #35916，CPU前向计算将Eigen替换为Lapack，修改linalg暴露规则
effb70f4 · crystal · GitHub · 14cdcde7 · effb70f4 · effb70f4
9 changed file
--- a/paddle/fluid/operators/eigh_op.cc
+++ b/paddle/fluid/operators/eigh_op.cc
@@ -147,18 +147,17 @@ REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker,
 REGISTER_OPERATOR(eigh_grad, ops::EighGradOp);

 REGISTER_OP_CPU_KERNEL(
-    eigh, ops::EighKernel<paddle::platform::CPUDeviceContext, float, float>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext, double, double>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext, float,
+    eigh, ops::EighKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::EighKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::EighKernel<paddle::platform::CPUDeviceContext,
                    paddle::platform::complex<float>>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext, double,
+    ops::EighKernel<paddle::platform::CPUDeviceContext,
                    paddle::platform::complex<double>>);

 REGISTER_OP_CPU_KERNEL(
-    eigh_grad,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext, float, float>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext, double, double>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext, float,
+    eigh_grad, ops::EighGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::EighGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::EighGradKernel<paddle::platform::CPUDeviceContext,
                        paddle::platform::complex<float>>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext, double,
+    ops::EighGradKernel<paddle::platform::CPUDeviceContext,
                        paddle::platform::complex<double>>);
--- a/paddle/fluid/operators/eigh_op.cu
+++ b/paddle/fluid/operators/eigh_op.cu
@@ -16,18 +16,17 @@ limitations under the License. */

 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    eigh, ops::EighKernel<paddle::platform::CUDADeviceContext, float, float>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext, double, double>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext, float,
+    eigh, ops::EighKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::EighKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::EighKernel<paddle::platform::CUDADeviceContext,
                    paddle::platform::complex<float>>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext, double,
+    ops::EighKernel<paddle::platform::CUDADeviceContext,
                    paddle::platform::complex<double>>);

 REGISTER_OP_CUDA_KERNEL(
-    eigh_grad,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext, float, float>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext, double, double>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext, float,
+    eigh_grad, ops::EighGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::EighGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::EighGradKernel<paddle::platform::CUDADeviceContext,
                        paddle::platform::complex<float>>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext, double,
+    ops::EighGradKernel<paddle::platform::CUDADeviceContext,
                        paddle::platform::complex<double>>);
--- a/paddle/fluid/operators/eigh_op.h
+++ b/paddle/fluid/operators/eigh_op.h
@@ -22,7 +22,7 @@ namespace operators {

 using Tensor = framework::Tensor;

-template <typename DeviceContext, typename ValueType, typename T>
+template <typename DeviceContext, typename T>
 class EighKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -31,15 +31,16 @@ class EighKernel : public framework::OpKernel<T> {
    auto output_v = ctx.Output<Tensor>("Eigenvectors");
    std::string lower = ctx.Attr<std::string>("UPLO");
    bool is_lower = (lower == "L");
-    math::MatrixEighFunctor<DeviceContext, ValueType, T> functor;
+    math::MatrixEighFunctor<DeviceContext, T> functor;
    functor(ctx, *input, output_w, output_v, is_lower, true);
  }
 };

-template <typename DeviceContext, typename ValueType, typename T>
+template <typename DeviceContext, typename T>
 class EighGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    using ValueType = math::Real<T>;
    auto& x_grad = *ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    x_grad.mutable_data<T>(ctx.GetPlace());
    auto& output_w = *ctx.Input<Tensor>("Eigenvalues");

--- a/paddle/fluid/operators/math/eigen_values_vectors.h
+++ b/paddle/fluid/operators/math/eigen_values_vectors.h
@@ -14,8 +14,8 @@

 #pragma once

-#include "Eigen/Core"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cusolver.h"
@@ -25,84 +25,6 @@ namespace paddle {
 namespace operators {
 namespace math {

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using InputMatrixMap = Eigen::Map<
-    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using OutputMatrixMap = Eigen::Map<
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-template <typename ValueType>
-inline void ComputeFloatEigenvaluesAndVectors(ValueType *x_data,
-                                              ValueType *eigenvalues_data,
-                                              ValueType *eigenvectors_data,
-                                              int batches, int rows, int cols,
-                                              bool has_vectors) {
-  int stride = rows * cols;
-  for (int i = 0; i < batches; i++) {
-    auto m = InputMatrixMap<ValueType>(x_data + i * stride, rows, cols);
-    auto eigenvalues =
-        OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
-    auto eigenvectors =
-        OutputMatrixMap<ValueType>(eigenvectors_data + i * stride, rows, cols);
-
-    Eigen::SelfAdjointEigenSolver<Eigen::Matrix<
-        ValueType, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
-                                    : Eigen::EigenvaluesOnly);
-    PADDLE_ENFORCE_EQ(
-        eigen_solver.info(), Eigen::Success,
-        platform::errors::InvalidArgument(
-            "Self Adjoint Eigen decomposition is not successful. "
-            "The %d-th input matrice might not be not be positive definite.",
-            i));
-
-    eigenvalues = eigen_solver.eigenvalues().transpose();
-    if (has_vectors) {
-      eigenvectors = eigen_solver.eigenvectors();
-    }
-  }
-}
-
-template <typename T, typename ValueType>
-inline void ComputeComplexEigenvaluesAndVectors(T *x_data,
-                                                ValueType *eigenvalues_data,
-                                                T *eigenvectors_data,
-                                                int batches, int rows, int cols,
-                                                bool has_vectors) {
-  using Complex = std::complex<ValueType>;
-  Complex *input = reinterpret_cast<Complex *>(x_data);
-  Complex *eigenvectors_data_ = reinterpret_cast<Complex *>(eigenvectors_data);
-
-  int stride = rows * cols;
-  for (int i = 0; i < batches; i++) {
-    auto m = InputMatrixMap<Complex>(input + i * stride, rows, cols);
-    auto eigenvalues =
-        OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
-    auto eigenvectors =
-        OutputMatrixMap<Complex>(eigenvectors_data_ + i * stride, rows, cols);
-
-    Eigen::SelfAdjointEigenSolver<
-        Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
-                                    : Eigen::EigenvaluesOnly);
-    PADDLE_ENFORCE_EQ(
-        eigen_solver.info(), Eigen::Success,
-        platform::errors::InvalidArgument(
-            "Self Adjoint Eigen decomposition is not successful. "
-            "The %d-th input matrice might not be not be positive definite.",
-            i));
-
-    eigenvalues = eigen_solver.eigenvalues().transpose();
-    if (has_vectors) {
-      eigenvectors = eigen_solver.eigenvectors();
-    }
-  }
-}
-
 inline int64_t GetBatchSize(framework::DDim dims) {
  int64_t batch_size = 1;
  auto dim_size = dims.size();
@@ -112,7 +34,20 @@ inline int64_t GetBatchSize(framework::DDim dims) {
  return batch_size;
 }

-template <typename DeviceContext, typename ValueType, typename T>
+static void CheckEighResult(const int batch, const int info) {
+  PADDLE_ENFORCE_LE(
+      info, 0,
+      platform::errors::PreconditionNotMet(
+          "For batch [%d]: the [%d] off-diagonal elements of an intermediate"
+          "tridiagonal form did not converge to zero",
+          batch, info));
+  PADDLE_ENFORCE_GE(
+      info, 0, platform::errors::PreconditionNotMet(
+                   "For batch [%d]: the [%d] argument had an illegal value",
+                   batch, info));
+}
+
+template <typename DeviceContext, typename T>
 struct MatrixEighFunctor {
  void operator()(const framework::ExecutionContext &ctx, const Tensor &input,
                  Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
@@ -122,43 +57,84 @@ struct MatrixEighFunctor {
 // Calculates the eigenvalues and eigenvectors of Hermitian or real
 // symmetric matrices, and uses the variable has_vectors to
 // control whether to return the eigenvectors.
-template <typename ValueType, typename T>
-struct MatrixEighFunctor<platform::CPUDeviceContext, ValueType, T> {
+template <typename T>
+struct MatrixEighFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const framework::ExecutionContext &ctx, const Tensor &input,
                  Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
                  bool has_vectors) {
-    auto dims = input.dims();
-    auto output_value_dim = eigen_values->dims();
+    using ValueType = math::Real<T>;
+    auto *out_value = eigen_values->mutable_data<ValueType>(ctx.GetPlace());

-    int64_t batch_size = 1;
-    int dim_size = dims.size();
-    for (int64_t i = 0; i < dim_size - 2; i++) {
-      batch_size *= dims[i];
-    }
    auto dito =
-        DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(ctx);
-    Tensor input_tensor;
-    TensorCopy(input, ctx.GetPlace(), &input_tensor);
-    if (!is_lower) {
-      input_tensor = dito.Transpose(input);
-    }
-    int rows = dims[dims.size() - 2];
+        math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(
+            ctx);

-    auto *value_data =
-        eigen_values->mutable_data<ValueType>(output_value_dim, ctx.GetPlace());
+    Tensor input_trans;
+    // lapack is a column-major storge, transpose make the input to
+    // have a continuous memory layout
+    input_trans = dito.Transpose(input);
+    auto *input_vector = input_trans.data<T>();

-    if (framework::IsComplexType(input_tensor.type())) {
-      auto *x_data = input_tensor.data<T>();
-      auto *vector_data = eigen_vectors->mutable_data<T>(dims, ctx.GetPlace());
-      ComputeComplexEigenvaluesAndVectors<T, ValueType>(
-          x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
-    } else {
-      auto *x_data = input_tensor.data<ValueType>();
-      auto *vector_data =
-          eigen_vectors->mutable_data<ValueType>(dims, ctx.GetPlace());
-      ComputeFloatEigenvaluesAndVectors<ValueType>(
-          x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
+    auto dims = input.dims();
+    int dim_size = dims.size();
+    int64_t batch_size = GetBatchSize(dims);
+
+    int vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    int values_stride = dims[dim_size - 1];
+    char uplo = is_lower ? 'L' : 'U';
+    char jobz = has_vectors ? 'V' : 'N';
+    auto n = dims[dim_size - 1];
+    auto lda = std::max<int64_t>(1, n);
+    // if work = -1, it means that you need to use the lapack function to query
+    // the optimal value
+    int lwork = -1;      // The length of the array work
+    int lrwork = -1;     // The dimension of the array rwork,rwork is REAL array
+    int liwork = -1;     // The dimension of the array iwork
+    int iwork_opt = -1;  // The optimal length of the array liwork
+    T lwork_opt = static_cast<T>(-1);  // The optimal length of the array work
+    ValueType rwork_opt =
+        static_cast<ValueType>(-1);  // The optimal length of the array rwork
+
+    int info = 0;
+    // Call lapackEigh to get the optimal size of work data
+    math::lapackEigh<T, ValueType>(jobz, uplo, n, input_vector, lda, out_value,
+                                   &lwork_opt, lwork, &rwork_opt, lrwork,
+                                   &iwork_opt, liwork, &info);
+    lwork = std::max<int>(1, static_cast<int>(lwork_opt));
+    liwork = std::max<int>(1, iwork_opt);
+
+    Tensor rwork_tensor;
+    ValueType *rwork_data = nullptr;
+
+    // complex type
+    if (framework::IsComplexType(input.type())) {
+      lrwork = std::max<int>(1, static_cast<int>(rwork_opt));
+      rwork_data = rwork_tensor.mutable_data<ValueType>(
+          framework::make_ddim({lrwork}), ctx.GetPlace());
+    }
+    Tensor iwork_tensor, work_tensor;
+    auto *iwork_data = iwork_tensor.mutable_data<int>(
+        framework::make_ddim({liwork}), ctx.GetPlace());
+    auto *work_data = work_tensor.mutable_data<T>(framework::make_ddim({lwork}),
+                                                  ctx.GetPlace());
+
+    for (auto i = 0; i < batch_size; i++) {
+      auto *value_data = out_value + i * values_stride;
+      auto *input_data = input_vector + i * vector_stride;
+      math::lapackEigh<T, Real<T>>(jobz, uplo, n, input_data, lda, value_data,
+                                   work_data, lwork, rwork_data, lrwork,
+                                   iwork_data, liwork, &info);
+      CheckEighResult(i, info);
+    }
+    if (has_vectors) {
+      PADDLE_ENFORCE_NOT_NULL(eigen_vectors,
+                              platform::errors::InvalidArgument(
+                                  "When has_vectors is true,"
+                                  "the eigenvectors needs to be calculated, "
+                                  "so the eigenvectors must be provided."));
+      input_trans = dito.Transpose(input_trans);
+      eigen_vectors->ShareDataWith(input_trans);
    }
  }
 };
@@ -168,15 +144,22 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, ValueType, T> {
 // Calculates the eigenvalues and eigenvectors of Hermitian or real
 // symmetric matrices on GPU, and uses the variable has_vectors
 // to control whether to return the eigenvectors.
-template <typename ValueType, typename T>
-struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
+template <typename T>
+struct MatrixEighFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const framework::ExecutionContext &ctx, const Tensor &input,
                  Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
                  bool has_vectors) {
+    using ValueType = math::Real<T>;
    auto *out_value = eigen_values->mutable_data<ValueType>(ctx.GetPlace());
-    auto *out_vector = eigen_vectors->mutable_data<T>(ctx.GetPlace());

+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto dito =
+        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
+                                                 T>(ctx);
+    Tensor input_trans;
+    input_trans = dito.Transpose(input);
+    auto *input_vector = input_trans.data<T>();
    auto &dims = input.dims();
    int dim_size = dims.size();
    int64_t batch_size = GetBatchSize(dims);
@@ -190,14 +173,6 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
    int lda = std::max<int>(1, n);
    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
    auto values_stride = dims[dim_size - 1];
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto dito =
-        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
-                                                 T>(ctx);
-    Tensor output_v_var_trans = dito.Transpose(input);
-    TensorCopy(output_v_var_trans, ctx.GetPlace(), eigen_vectors);
-
    int lwork = 0;
    auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_size);
    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
@@ -205,10 +180,8 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
    // When the input type is float32, and the feature value input dimension is
    // greater than or equal to [*,32,32]  and less than or equal to
    // [*,512,512], Syevj has better performance.
-    bool use_syevj =
-        (eigen_vectors->type() == framework::proto::VarType::FP32 &&
-         values_stride >= 32 && values_stride <= 512);
-
+    bool use_syevj = (input.type() == framework::proto::VarType::FP32 &&
+                      values_stride >= 32 && values_stride <= 512);
    syevjInfo_t syevj_params;
    if (use_syevj) {
      PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -216,52 +189,52 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cusolverDnSsyevj_bufferSize(
              dev_ctx.cusolver_dn_handle(), jobz, uplo, n,
-              reinterpret_cast<const float *>(out_vector), lda,
+              reinterpret_cast<const float *>(input_vector), lda,
              reinterpret_cast<const float *>(out_value), &lwork,
              syevj_params));
    } else {
-      EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, out_vector, lda,
+      EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, input_vector, lda,
                out_value, &lwork);
    }
-
    auto work = memory::Alloc(dev_ctx, sizeof(T) * lwork);
    auto *work_ptr = reinterpret_cast<T *>(work->ptr());
-
    for (auto i = 0; i < batch_size; i++) {
-      auto vector_data = out_vector + i * vector_stride;
-      auto value_data = out_value + i * values_stride;
+      auto *input_data = input_vector + i * vector_stride;
+      auto *value_data = out_value + i * values_stride;
      auto handle = dev_ctx.cusolver_dn_handle();
      if (use_syevj) {
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj(
-            handle, jobz, uplo, n, reinterpret_cast<float *>(vector_data), lda,
+            handle, jobz, uplo, n, reinterpret_cast<float *>(input_data), lda,
            reinterpret_cast<float *>(value_data),
            reinterpret_cast<float *>(work_ptr), lwork, info_ptr,
            syevj_params));
      } else {
-        Evd(handle, jobz, uplo, n, vector_data, lda, value_data, work_ptr,
-            lwork, info_ptr);
+        Evd(handle, jobz, uplo, n, input_data, lda, value_data, work_ptr, lwork,
+            info_ptr);
      }
-      int error_info;
+      int error_info = 0;
      memory::Copy(platform::CPUPlace(), &error_info,
                   BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
                   info_ptr, sizeof(int), dev_ctx.stream());
-      PADDLE_ENFORCE_EQ(
-          error_info, 0,
-          platform::errors::PreconditionNotMet(
-              "For batch [%d]: the [%d] argument had an illegal value", i,
-              error_info));
+      CheckEighResult(i, error_info);
    }

    if (use_syevj) {
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cusolverDnDestroySyevjInfo(syevj_params));
    }
-
    if (has_vectors) {
-      *eigen_vectors = dito.Transpose(*eigen_vectors);
+      PADDLE_ENFORCE_NOT_NULL(eigen_vectors,
+                              platform::errors::InvalidArgument(
+                                  "When has_vectors is true,"
+                                  "the eigenvectors needs to be calculated,"
+                                  "so the eigenvectors must be provided."));
+      input_trans = dito.Transpose(input_trans);
+      eigen_vectors->ShareDataWith(input_trans);
    }
  }

+  using ValueType = math::Real<T>;
  inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz,
                        cublasFillMode_t uplo, int n, const T *A, int lda,
                        const ValueType *W, int *lwork) const;
@@ -271,15 +244,14 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
                  T *work, int lwork, int *devInfo) const;
 };

-#define FUNC_WITH_TYPES(m)                                       \
-  m(float, float, Ssy, float) m(double, double, Dsy, double)     \
-      m(float, paddle::platform::complex<float>, Che, cuComplex) \
-          m(double, paddle::platform::complex<double>, Zhe, cuDoubleComplex)
+#define FUNC_WITH_TYPES(m)                                \
+  m(float, Ssy, float) m(double, Dsy, double)             \
+      m(paddle::platform::complex<float>, Che, cuComplex) \
+          m(paddle::platform::complex<double>, Zhe, cuDoubleComplex)

-#define EVDBUFFER_INSTANCE(ValueType, T, C, CastType)                          \
+#define EVDBUFFER_INSTANCE(T, C, CastType)                                     \
  template <>                                                                  \
-  inline void                                                                  \
-  MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T>::EvdBuffer(     \
+  inline void MatrixEighFunctor<platform::CUDADeviceContext, T>::EvdBuffer(    \
      cusolverDnHandle_t handle, cusolverEigMode_t jobz,                       \
      cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W,   \
      int *lwork) const {                                                      \
@@ -291,10 +263,9 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {

 FUNC_WITH_TYPES(EVDBUFFER_INSTANCE);

-#define EVD_INSTANCE(ValueType, T, C, CastType)                           \
+#define EVD_INSTANCE(T, C, CastType)                                      \
  template <>                                                             \
-  inline void                                                             \
-  MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T>::Evd(      \
+  inline void MatrixEighFunctor<platform::CUDADeviceContext, T>::Evd(     \
      cusolverDnHandle_t handle, cusolverEigMode_t jobz,                  \
      cublasFillMode_t uplo, int n, T *A, int lda, ValueType *W, T *work, \
      int lwork, int *devInfo) const {                                    \

--- a/paddle/fluid/operators/math/lapack_function.cc
+++ b/paddle/fluid/operators/math/lapack_function.cc
@@ -31,6 +31,49 @@ void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
  platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
 }

+// eigh
+template <>
+void lapackEigh<float>(char jobz, char uplo, int n, float *a, int lda, float *w,
+                       float *work, int lwork, float *rwork, int lrwork,
+                       int *iwork, int liwork, int *info) {
+  (void)rwork;   // unused
+  (void)lrwork;  // unused
+  platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
+                             &liwork, info);
+}
+
+template <>
+void lapackEigh<double>(char jobz, char uplo, int n, double *a, int lda,
+                        double *w, double *work, int lwork, double *rwork,
+                        int lrwork, int *iwork, int liwork, int *info) {
+  (void)rwork;   // unused
+  (void)lrwork;  // unused
+  platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
+                             &liwork, info);
+}
+
+template <>
+void lapackEigh<platform::complex<float>, float>(
+    char jobz, char uplo, int n, platform::complex<float> *a, int lda, float *w,
+    platform::complex<float> *work, int lwork, float *rwork, int lrwork,
+    int *iwork, int liwork, int *info) {
+  platform::dynload::cheevd_(&jobz, &uplo, &n,
+                             reinterpret_cast<std::complex<float> *>(a), &lda,
+                             w, reinterpret_cast<std::complex<float> *>(work),
+                             &lwork, rwork, &lrwork, iwork, &liwork, info);
+}
+
+template <>
+void lapackEigh<platform::complex<double>, double>(
+    char jobz, char uplo, int n, platform::complex<double> *a, int lda,
+    double *w, platform::complex<double> *work, int lwork, double *rwork,
+    int lrwork, int *iwork, int liwork, int *info) {
+  platform::dynload::zheevd_(&jobz, &uplo, &n,
+                             reinterpret_cast<std::complex<double> *>(a), &lda,
+                             w, reinterpret_cast<std::complex<double> *>(work),
+                             &lwork, rwork, &lrwork, iwork, &liwork, info);
+}
+
 // Eig
 template <>
 void lapackEig<double>(char jobvl, char jobvr, int n, double *a, int lda,

--- a/paddle/fluid/operators/math/lapack_function.h
+++ b/paddle/fluid/operators/math/lapack_function.h
@@ -20,12 +20,17 @@ namespace math {

 // LU (for example)
 template <typename T>
-void lapackLu(int m, int n, T *a, int lda, int *ipiv, int *info);
+void lapackLu(int m, int n, T* a, int lda, int* ipiv, int* info);
+
+template <typename T, typename ValueType = T>
+void lapackEigh(char jobz, char uplo, int n, T* a, int lda, ValueType* w,
+                T* work, int lwork, ValueType* rwork, int lrwork, int* iwork,
+                int liwork, int* info);

 template <typename T1, typename T2 = T1>
-void lapackEig(char jobvl, char jobvr, int n, T1 *a, int lda, T1 *w, T1 *vl,
-               int ldvl, T1 *vr, int ldvr, T1 *work, int lwork, T2 *rwork,
-               int *info);
+void lapackEig(char jobvl, char jobvr, int n, T1* a, int lda, T1* w, T1* vl,
+               int ldvl, T1* vr, int ldvr, T1* work, int lwork, T2* rwork,
+               int* info);

 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/platform/dynload/lapack.h
+++ b/paddle/fluid/platform/dynload/lapack.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <complex>
 #include <mutex>
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"

@@ -28,6 +29,22 @@ extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
 extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
                        int *info);

+// evd
+extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
+                        int *lda, double *w, std::complex<double> *work,
+                        int *lwork, double *rwork, int *lrwork, int *iwork,
+                        int *liwork, int *info);
+extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
+                        int *lda, float *w, std::complex<float> *work,
+                        int *lwork, float *rwork, int *lrwork, int *iwork,
+                        int *liwork, int *info);
+extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
+                        double *w, double *work, int *lwork, int *iwork,
+                        int *liwork, int *info);
+extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
+                        float *w, float *work, int *lwork, int *iwork,
+                        int *liwork, int *info);
+
 // geev
 extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
                       double *wr, double *wi, double *vl, int *ldvl,
@@ -81,6 +98,10 @@ extern void *lapack_dso_handle;
 #define LAPACK_ROUTINE_EACH(__macro) \
  __macro(dgetrf_);                  \
  __macro(sgetrf_);                  \
+  __macro(zheevd_);                  \
+  __macro(cheevd_);                  \
+  __macro(dsyevd_);                  \
+  __macro(ssyevd_);                  \
  __macro(dgeev_);                   \
  __macro(sgeev_);                   \
  __macro(zgeev_);                   \

--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -106,7 +106,6 @@ from .tensor.linalg import slogdet  # noqa: F401
 from .tensor.linalg import multi_dot  # noqa: F401
 from .tensor.linalg import matrix_power  # noqa: F401
 from .tensor.linalg import svd  # noqa: F401
-from .tensor.linalg import eigh  # noqa: F401
 from .tensor.linalg import pinv  # noqa: F401
 from .tensor.linalg import solve  # noqa: F401
 from .tensor.logic import equal  # noqa: F401

--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1759,7 +1759,7 @@ def eigh(x, UPLO='L', name=None):

            x_data = np.array([[1, -2j], [2j, 5]])
            x = paddle.to_tensor(x_data)
-            out_value, out_vector = paddle.eigh(x, UPLO='L')
+            out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
            print(out_value)
            #[0.17157288, 5.82842712]
            print(out_vector)
@@ -1780,7 +1780,7 @@ def eigh(x, UPLO='L', name=None):
            raise ValueError(
                "The input matrix must be batches of square matrices. But received x's dimention: {}".
                format(x_shape))
-        if UPLO is not 'L' and UPLO is not 'U':
+        if UPLO != 'L' and UPLO != 'U':
            raise ValueError(
                "UPLO must be L or U. But received UPLO is: {}".format(UPLO))