Unverified commit eb1fbf12, authored by 0x45f, committed by GitHub

Add matrix_rank op and its GPU and CPU kernels (#34823)

* init matrix_rank op, add matrix_rank CPU code and test

* add GPU kernel, remove svd_eigen.h

* add CPU kernel when tol is tensor

* add cpu and gpu code when tol is tensor

* fix CI-ROCM error

* add matrix_rank API description, fix PR-CI-Py3 error

* fix PR-CI-Windows error, add matrix_rank API test

* delete useless comments

* address review comments

* add my code in svd_helper.h

* update doc comments

* remove spaces
Parent 0cb413d3
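For context on what this change adds, here is a minimal sketch of how the new paddle.linalg.matrix_rank API is intended to be called, based on the docstring and unit tests in this commit (shapes and tolerances are taken from those examples):

import paddle

# Rank of an identity matrix with the default tolerance
# (tol = sigma_max * max(m, n) * eps, computed per matrix).
a = paddle.eye(10)
print(paddle.linalg.matrix_rank(a, hermitian=True))  # 10

# Batched input with an explicit scalar tolerance; the result has
# the batch shape [3, 4].
c = paddle.ones(shape=[3, 4, 5, 5])
print(paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True))  # all entries are 1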
......@@ -183,6 +183,7 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
list(REMOVE_ITEM hip_srcs "svd_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/matrix_rank_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/svd_helper.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
using DDim = framework::DDim;
namespace detail {
static DDim GetInputBatchDim(const DDim& dim_x) {
auto x_vec = framework::vectorize(dim_x);
if (x_vec.size() == 2) {
return framework::make_ddim({1});
}
x_vec.erase(x_vec.end() - 2, x_vec.end());
return framework::make_ddim(x_vec);
}
} // namespace detail
class MatrixRankeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "MatrixRank");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "MatrixRank");
auto dim_x = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(dim_x.size(), 2,
platform::errors::InvalidArgument(
"The dims of input must be greater than 2"));
bool hermitian = ctx->Attrs().Get<bool>("hermitian");
if (hermitian) {
int rows = dim_x[dim_x.size() - 2];
int cols = dim_x[dim_x.size() - 1];
PADDLE_ENFORCE_EQ(rows, cols,
platform::errors::InvalidArgument(
"if hermitian == true, matrix should be n*n"));
}
DDim dim_x_batch = detail::GetInputBatchDim(dim_x);
if (ctx->Attrs().Get<bool>(
"use_default_tol")) { // user not input TolTensor and tol
ctx->SetOutputDim("Out", dim_x_batch);
} else if (ctx->HasInput("TolTensor")) {
auto dim_tol = ctx->GetInputDim("TolTensor");
if (dim_x_batch == dim_tol) {
ctx->SetOutputDim("Out", dim_x_batch);
} else {
int max_dim = std::max(dim_x_batch.size(), dim_tol.size());
int axis = std::abs(dim_x_batch.size() - dim_tol.size());
std::vector<int> x_batch_dims_array(max_dim);
std::vector<int> tol_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(),
tol_dims_array.data(), out_dims_array.data(),
max_dim, axis);
for (auto& it : out_dims_array) {
VLOG(3) << "out dims: " << it;
}
ctx->SetOutputDim("Out", framework::make_ddim(out_dims_array));
}
} else {
ctx->SetOutputDim("Out", dim_x_batch);
}
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
}
};
class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of matrix_rank op.");
AddInput("TolTensor", "(optional) Tol tensor, shape is same as X batch.")
.AsDispensable();
AddOutput("Out", "(Tensor), The output tensor of matrix_rank op.");
AddAttr<float>("tol", "(float, optional). tol").SetDefault(0.0f);
AddAttr<bool>("use_default_tol",
"represent whether user input TolTensor/tol, if input "
"TolTensor/tol use_default_tol=true, otherwise "
"use_default_tol=false")
.SetDefault(true);
AddAttr<bool>("hermitian", "(bool, optional). whether is hermitian matrix")
.SetDefault(false);
AddComment(R"DOC(MatrixRank Operator.
This operator computes the rank of a batch of matrices.
$$out = matrix_rank(X, tol, hermitian)$$
)DOC");
}
};
template <typename T>
void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches,
int rows, int cols, int k) {
// Eigen::Map requires a non-const pointer.
T* input = const_cast<T*>(x_data);
int stride = rows * cols;
for (int i = 0; i < batches; i++) {
auto m = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
input + i * stride, rows, rows);
Eigen::SelfAdjointEigenSolver<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
eigen_solver(m);
auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs();
for (int j = 0; j < k; j++) {
*(eigenvalues_data + i * k + j) = eigenvalues[j];
}
}
}
template <typename T>
void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows,
int cols, int k) {
// Eigen::Map requires a non-const pointer.
T* input = const_cast<T*>(x_data);
int stride = rows * cols;
Eigen::BDCSVD<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
svd;
for (int i = 0; i < batches; i++) {
auto m = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
input + i * stride, rows, cols);
svd.compute(m);
auto res_s = svd.singularValues();
for (int j = 0; j < k; j++) {
eigenvalues_data[i * k + j] = res_s[j];
}
}
}
template <typename T>
class MatrixRankCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
auto* x_data = x->data<T>();
auto* out = context.Output<Tensor>("Out");
out->mutable_data<int64_t>(context.GetPlace());
bool hermitian = context.Attr<bool>("hermitian");
auto dim_x = x->dims();
auto dim_out = out->dims();
int rows = dim_x[dim_x.size() - 2];
int cols = dim_x[dim_x.size() - 1];
int k = std::min(rows, cols);
auto numel = x->numel();
int batches = numel / (rows * cols);
bool use_default_tol = context.Attr<bool>("use_default_tol");
const Tensor* atol_tensor = nullptr;
Tensor temp_tensor;
T rtol_T = 0;
if (use_default_tol) {
framework::TensorFromVector<T>(std::vector<T>{0},
context.device_context(), &temp_tensor);
atol_tensor = &temp_tensor;
rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
} else if (context.HasInput("TolTensor")) {
atol_tensor = context.Input<Tensor>("TolTensor");
} else {
framework::TensorFromVector<T>(std::vector<T>{context.Attr<float>("tol")},
context.device_context(), &temp_tensor);
atol_tensor = &temp_tensor;
}
Tensor eigenvalue_tensor;
auto* eigenvalue_data = eigenvalue_tensor.mutable_data<T>(
detail::GetEigenvalueDim(dim_x, k), context.GetPlace());
if (hermitian) {
BatchEigenvalues<T>(x_data, eigenvalue_data, batches, rows, cols, k);
} else {
BatchSVD<T>(x_data, eigenvalue_data, batches, rows, cols, k);
}
auto dito_T =
math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(
context);
std::vector<int> max_eigenvalue_shape = framework::vectorize<int>(
detail::RemoveLastDim(eigenvalue_tensor.dims()));
Tensor max_eigenvalue_tensor =
dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape);
Tensor temp_rtol_tensor;
framework::TensorFromVector<T>(std::vector<T>{rtol_T}, &temp_rtol_tensor);
Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor);
Tensor tol_tensor;
tol_tensor.mutable_data<T>(dim_out, context.GetPlace());
ElementwiseComputeEx<GreaterElementFunctor<T>, platform::CPUDeviceContext,
T, T>(context, atol_tensor, &rtol_tensor, -1,
GreaterElementFunctor<T>(), &tol_tensor);
tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
Tensor compare_result;
compare_result.mutable_data<int>(detail::NewAxisDim(dim_out, k),
context.GetPlace());
int axis = -1;
if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CPUDeviceContext, T,
int>(context, &eigenvalue_tensor, &tol_tensor, axis,
GreaterThanFunctor<T>(), &compare_result);
} else {
ElementwiseComputeEx<LessThanFunctor<T>, platform::CPUDeviceContext, T,
int>(context, &eigenvalue_tensor, &tol_tensor, axis,
LessThanFunctor<T>(), &compare_result);
}
auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext,
int64_t>(context);
std::vector<int> result_shape = framework::vectorize<int>(dim_out);
Tensor result = dito_int.ReduceSum(compare_result, result_shape);
out->ShareDataWith(result);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker);
REGISTER_OP_CPU_KERNEL(matrix_rank, ops::MatrixRankCPUKernel<float>,
ops::MatrixRankCPUKernel<double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_WITH_HIP
// HIP does not support cuSOLVER
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/matrix_rank_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
namespace detail {
DDim GetUDDim(const DDim& x_dim, int k) {
auto x_vec = framework::vectorize(x_dim);
x_vec[x_vec.size() - 1] = k;
return framework::make_ddim(x_vec);
}
DDim GetVHDDim(const DDim& x_dim, int k) {
auto x_vec = framework::vectorize(x_dim);
x_vec[x_vec.size() - 2] = k;
return framework::make_ddim(x_vec);
}
} // namespace detail
template <typename T>
class MatrixRankGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const Tensor* x = context.Input<Tensor>("X");
auto* x_data = x->data<T>();
auto* out = context.Output<Tensor>("Out");
out->mutable_data<int64_t>(context.GetPlace());
bool hermitian = context.Attr<bool>("hermitian");
auto dim_x = x->dims();
auto dim_out = out->dims();
int rows = dim_x[dim_x.size() - 2];
int cols = dim_x[dim_x.size() - 1];
int k = std::min(rows, cols);
auto numel = x->numel();
int batches = numel / (rows * cols);
bool use_default_tol = context.Attr<bool>("use_default_tol");
const Tensor* atol_tensor = nullptr;
Tensor temp_tensor;
T rtol_T = 0;
if (use_default_tol) {
framework::TensorFromVector<T>(std::vector<T>{0},
context.device_context(), &temp_tensor);
atol_tensor = &temp_tensor;
rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
} else if (context.HasInput("TolTensor")) {
atol_tensor = context.Input<Tensor>("TolTensor");
} else {
framework::TensorFromVector<T>(std::vector<T>{context.Attr<float>("tol")},
context.device_context(), &temp_tensor);
atol_tensor = &temp_tensor;
}
// X must be copied once, because gesvdj destroys the contents of its input on exit.
Tensor x_tmp;
TensorCopy(*x, context.GetPlace(), &x_tmp);
auto info = memory::Alloc(dev_ctx, sizeof(int) * batches);
int* info_ptr = reinterpret_cast<int*>(info->ptr());
Tensor eigenvalue_tensor;
auto* eigenvalue_data = eigenvalue_tensor.mutable_data<T>(
detail::GetEigenvalueDim(dim_x, k), context.GetPlace());
if (hermitian) {
SyevjBatched(dev_ctx, batches, rows, x_tmp.data<T>(), eigenvalue_data,
info_ptr);
platform::ForRange<platform::CUDADeviceContext> for_range(
dev_ctx, eigenvalue_tensor.numel());
math::AbsFunctor<T> functor(eigenvalue_data, eigenvalue_data,
eigenvalue_tensor.numel());
for_range(functor);
} else {
Tensor U, VH;
auto* u_data =
U.mutable_data<T>(detail::GetUDDim(dim_x, k), context.GetPlace());
auto* vh_data =
VH.mutable_data<T>(detail::GetVHDDim(dim_x, k), context.GetPlace());
GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data<T>(), vh_data,
u_data, eigenvalue_data, info_ptr, 1);
}
auto dito_T =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
T>(context);
std::vector<int> max_eigenvalue_shape = framework::vectorize<int>(
detail::RemoveLastDim(eigenvalue_tensor.dims()));
Tensor max_eigenvalue_tensor =
dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape);
Tensor temp_rtol_tensor;
framework::TensorFromVector<T>(std::vector<T>{rtol_T},
context.device_context(), &temp_rtol_tensor);
Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor);
Tensor tol_tensor;
tol_tensor.mutable_data<T>(dim_out, context.GetPlace());
ElementwiseComputeEx<GreaterElementFunctor<T>, platform::CUDADeviceContext,
T, T>(context, atol_tensor, &rtol_tensor, -1,
GreaterElementFunctor<T>(), &tol_tensor);
tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
Tensor compare_result;
compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
context.GetPlace());
int axis = -1;
if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
ElementwiseComputeEx<GreaterThanFunctor<T>, platform::CUDADeviceContext,
T, int64_t>(context, &eigenvalue_tensor, &tol_tensor,
axis, GreaterThanFunctor<T>(),
&compare_result);
} else {
ElementwiseComputeEx<LessThanFunctor<T>, platform::CUDADeviceContext, T,
int64_t>(context, &eigenvalue_tensor, &tol_tensor,
axis, LessThanFunctor<T>(),
&compare_result);
}
auto dito_int =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
int64_t>(context);
std::vector<int> result_shape = framework::vectorize<int>(dim_out);
Tensor result = dito_int.ReduceSum(compare_result, result_shape);
out->ShareDataWith(result);
}
void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize,
int m, int n, int k, T* A, T* U, T* V, T* S, int* info,
int thin_UV = 1) const;
void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize,
int n, T* A, T* W, int* info) const;
};
template <>
void MatrixRankGPUKernel<float>::GesvdjBatched(
const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n,
int k, float* A, float* U, float* V, float* S, int* info,
int thin_UV) const {
// do not compute singular vectors
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize(
handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
gesvdj_params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; i++) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj(
handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
info, gesvdj_params));
int error_info;
memory::Copy(platform::CPUPlace(), &error_info,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info,
sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
template <>
void MatrixRankGPUKernel<double>::GesvdjBatched(
const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n,
int k, double* A, double* U, double* V, double* S, int* info,
int thin_UV) const {
// do not compute singular vectors
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize(
handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
gesvdj_params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; ++i) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj(
handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
info, gesvdj_params));
// check the error info
int error_info;
memory::Copy(platform::CPUPlace(), &error_info,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info,
sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
template <>
void MatrixRankGPUKernel<float>::SyevjBatched(
const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A,
float* W, int* info) const {
auto handle = dev_ctx.cusolver_dn_handle();
// Compute eigenvalues only
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
// cuSOLVER stores matrices in column-major order.
// NumPy and PyTorch use the lower triangle to compute eigenvalues, so for
// this row-major input the upper triangle is used here.
cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
int lda = n;
int stride_A = lda * n;
int lwork = 0;
syevjInfo_t params = NULL;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
for (int i = 0; i < batchSize; i++) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj(
handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
lwork, info, params));
int error_info;
memory::Copy(platform::CPUPlace(), &error_info,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info,
sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i,
error_info));
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnDestroySyevjInfo(params));
}
template <>
void MatrixRankGPUKernel<double>::SyevjBatched(
const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A,
double* W, int* info) const {
auto handle = dev_ctx.cusolver_dn_handle();
// Compute eigenvalues only
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
// upper triangle of A is stored
cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
int lda = n;
int stride_A = lda * n;
int lwork = 0;
syevjInfo_t params = NULL;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
for (int i = 0; i < batchSize; i++) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj(
handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
lwork, info, params));
int error_info;
memory::Copy(platform::CPUPlace(), &error_info,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info,
sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i,
error_info));
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cusolverDnDestroySyevjInfo(params));
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel<float>,
ops::MatrixRankGPUKernel<double>);
#endif // not PADDLE_WITH_HIP
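A note on the argument order in GesvdjBatched above: it is called with cols and rows swapped because cuSOLVER expects column-major storage, so the row-major tensor is effectively handed over as its transpose; since a matrix and its transpose share the same singular values, the rank computation is unaffected. A quick NumPy check of that property (illustrative only):

import numpy as np

a = np.random.rand(7, 4)
s1 = np.linalg.svd(a, compute_uv=False)
s2 = np.linalg.svd(a.T, compute_uv=False)
print(np.allclose(s1, s2))  # True: A and A^T have identical singular values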
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
namespace detail {
static DDim GetEigenvalueDim(const DDim& dim, int k) {
auto vec = framework::vectorize(dim);
vec.erase(vec.end() - 2, vec.end());
vec.push_back(k);
return framework::make_ddim(vec);
}
static DDim NewAxisDim(const DDim& dim, int k) {
auto vec = framework::vectorize(dim);
vec.push_back(k);
return framework::make_ddim(vec);
}
static DDim RemoveLastDim(const DDim& dim) {
auto vec = framework::vectorize(dim);
if (vec.size() <= 1) {
return framework::make_ddim({1});
}
vec.erase(vec.end() - 1, vec.end());
return framework::make_ddim(vec);
}
} // namespace detail
template <typename T>
struct GreaterThanFunctor {
HOSTDEVICE int operator()(const T& a, const T& b) const { return a > b; }
};
template <typename T>
struct LessThanFunctor {
HOSTDEVICE int operator()(const T& a, const T& b) const { return a < b; }
};
template <typename T>
struct GreaterElementFunctor {
HOSTDEVICE T operator()(const T& a, const T& b) const {
if (a > b) {
return a;
} else {
return b;
}
}
};
} // namespace operators
} // namespace paddle
......@@ -313,6 +313,22 @@ struct DeviceIndependenceTensorOperations {
return CreateOpRunAndReturnTensor("slice", inputs, attrs, out_shape);
}
framework::Tensor ReduceSum(const framework::Tensor& x,
std::vector<int> out_dim) {
framework::AttributeMap attrs;
attrs["dim"] = std::vector<int>{-1};
NameInTensorMap inputs({{"X", {&x}}});
return CreateOpRunAndReturnTensor("reduce_sum", inputs, attrs, out_dim);
}
framework::Tensor ReduceMax(const framework::Tensor& x,
std::vector<int> out_dim) {
framework::AttributeMap attrs;
attrs["dim"] = std::vector<int>{-1};
NameInTensorMap inputs({{"X", {&x}}});
return CreateOpRunAndReturnTensor("reduce_max", inputs, attrs, out_dim);
}
private:
const framework::ExecutionContext& context;
BlasT<DeviceContext, T> GetBlas() {
......
......@@ -28,6 +28,11 @@ CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUSOLVER_ROUTINE_EACH_R1
CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
#endif
#ifdef CUSOLVER_ROUTINE_EACH_R2
CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -66,6 +66,18 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R2(__macro) \
__macro(cusolverDnCreateSyevjInfo); \
__macro(cusolverDnSsyevj_bufferSize); \
__macro(cusolverDnDsyevj_bufferSize); \
__macro(cusolverDnSsyevj); \
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -67,7 +67,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"matrix_rank", {"X", "TolTensor"}},
};
// NOTE(zhiqiu): Like op_ins_map.
// Commonly, the outputs in auto-generated OP function are determined by the
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
SEED = 2049
np.random.seed(SEED)
class TestMatrixRankOP(OpTest):
def setUp(self):
self.op_type = "matrix_rank"
self.init_data()
self.inputs = {'X': self.x}
self.attrs = {'hermitian': self.hermitian}
if self.tolTensor is not None:
self.inputs["TolTensor"] = self.tolTensor
if self.tol is not None:
self.attrs["tol"] = self.tol
self.attrs["use_default_tol"] = self.use_default_tol
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
def init_data(self):
self.x = np.eye(3, dtype=np.float32)
self.tolTensor = None
self.tol = 0.1
self.use_default_tol = False
self.hermitian = True
self.out = np.linalg.matrix_rank(self.x, self.tol, self.hermitian)
class TestMatrixRankOP1(TestMatrixRankOP):
def init_data(self):
self.x = np.eye(3, k=1, dtype=np.float64)
self.tolTensor = None
self.tol = None
self.use_default_tol = True
self.hermitian = False
self.out = np.linalg.matrix_rank(self.x, self.tolTensor, self.hermitian)
class TestMatrixRankOP2(TestMatrixRankOP):
def init_data(self):
self.x = np.random.rand(3, 4, 5, 6).astype(np.float32)
self.tolTensor = np.random.random([3, 4]).astype(self.x.dtype)
self.tol = None
self.use_default_tol = False
self.hermitian = False
self.out = np.linalg.matrix_rank(self.x, self.tolTensor, self.hermitian)
class TestMatrixRankOP3(TestMatrixRankOP):
def init_data(self):
self.x = np.eye(200, dtype=np.float64)
self.tolTensor = None
self.tol = None
self.use_default_tol = True
self.hermitian = True
self.out = np.linalg.matrix_rank(self.x, self.tolTensor, self.hermitian)
class TestMatrixRankOP4(TestMatrixRankOP):
def init_data(self):
self.x = np.random.rand(1, 10).astype(np.float32)
self.tolTensor = None
self.tol = None
self.use_default_tol = True
self.hermitian = False
self.out = np.linalg.matrix_rank(self.x, self.tolTensor, self.hermitian)
class TestMatrixRankOP5(TestMatrixRankOP):
def init_data(self):
self.x = np.random.rand(5, 1).astype(np.float64)
self.tolTensor = np.random.random([1, 4]).astype(self.x.dtype)
self.tol = None
self.use_default_tol = False
self.hermitian = False
self.out = np.linalg.matrix_rank(self.x, self.tolTensor, self.hermitian)
class TestMatrixRankAPI(unittest.TestCase):
def test_dygraph(self):
paddle.disable_static()
x_np = np.eye(10, dtype=np.float32)
x_pd = paddle.to_tensor(x_np)
rank_np = np.linalg.matrix_rank(x_np, hermitian=True)
rank_pd = paddle.linalg.matrix_rank(x_pd, hermitian=True)
self.assertTrue(np.allclose(rank_np, rank_pd))
x_np = np.random.rand(3, 4, 7, 8).astype(np.float64)
tol_np = np.random.random([3, 4]).astype(np.float32)
x_pd = paddle.to_tensor(x_np)
tol_pd = paddle.to_tensor(tol_np)
rank_np = np.linalg.matrix_rank(x_np, tol_np, hermitian=False)
rank_pd = paddle.linalg.matrix_rank(x_pd, tol_pd, hermitian=False)
self.assertTrue(np.allclose(rank_np, rank_pd))
x_np = np.random.rand(3, 4, 7, 8).astype(np.float64)
x_pd = paddle.to_tensor(x_np)
tol = 0.1
rank_np = np.linalg.matrix_rank(x_np, tol, hermitian=False)
rank_pd = paddle.linalg.matrix_rank(x_pd, tol, hermitian=False)
self.assertTrue(np.allclose(rank_np, rank_pd))
def test_static(self):
paddle.enable_static()
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
with fluid.program_guard(fluid.Program(), fluid.Program()):
x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
tol_np = np.random.random([3, 4]).astype(np.float32)
x_pd = paddle.fluid.data(
name="X", shape=[3, 4, 7, 7], dtype='float64')
tol_pd = paddle.fluid.data(
name="TolTensor", shape=[3, 4], dtype='float32')
rank_np = np.linalg.matrix_rank(x_np, tol_np, hermitian=False)
rank_pd = paddle.linalg.matrix_rank(
x_pd, tol_pd, hermitian=False)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"X": x_np,
"TolTensor": tol_np},
fetch_list=[rank_pd])
self.assertTrue(np.allclose(fetches[0], rank_np))
for place in places:
with fluid.program_guard(fluid.Program(), fluid.Program()):
x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
x_pd = paddle.fluid.data(
name="X", shape=[3, 4, 7, 7], dtype='float64')
rank_np = np.linalg.matrix_rank(x_np, hermitian=True)
rank_pd = paddle.linalg.matrix_rank(x_pd, hermitian=True)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"X": x_np},
fetch_list=[rank_pd])
self.assertTrue(np.allclose(fetches[0], rank_np))
for place in places:
with fluid.program_guard(fluid.Program(), fluid.Program()):
x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
x_pd = paddle.fluid.data(
name="X", shape=[3, 4, 7, 7], dtype='float64')
rank_np = np.linalg.matrix_rank(x_np, 0.1, hermitian=False)
rank_pd = paddle.linalg.matrix_rank(x_pd, 0.1, hermitian=False)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"X": x_np},
fetch_list=[rank_pd])
self.assertTrue(np.allclose(fetches[0], rank_np))
if __name__ == '__main__':
unittest.main()
......@@ -16,12 +16,14 @@ from .tensor.linalg import cholesky # noqa: F401
from .tensor.linalg import norm # noqa: F401
from .tensor.linalg import matrix_power # noqa: F401
from .tensor import inverse as inv # noqa: F401
from .tensor.linalg import matrix_rank
from .tensor.linalg import svd
__all__ = [
'cholesky', #noqa
'norm',
'inv',
'matrix_rank',
'svd',
'matrix_power'
]
......@@ -15,9 +15,9 @@
import numpy as np
from ..fluid.layer_helper import LayerHelper
from ..fluid.data_feeder import check_variable_and_dtype, check_type
from ..fluid.framework import in_dygraph_mode, _varbase_creator
from ..fluid.framework import in_dygraph_mode, _varbase_creator, Variable
from ..fluid.layers import transpose # noqa: F401
from ..fluid.layers import transpose, cast # noqa: F401
from paddle.common_ops_import import core
from paddle.common_ops_import import VarDesc
from paddle import _C_ops
......@@ -785,6 +785,94 @@ def cholesky(x, upper=False, name=None):
return out
def matrix_rank(x, tol=None, hermitian=False, name=None):
r"""
Computes the rank of a matrix.
The rank of a matrix is the number of singular values greater than the specified tol threshold when hermitian=False,
or the number of eigenvalues whose absolute values are greater than the specified tol threshold when hermitian=True.
Args:
x (Tensor): The input tensor.
Its shape should be [..., m, n], where ... is zero or more batch dimensions. If x is a batch of matrices then the output
has the same batch dimensions. The data type of x should be float32 or float64.
tol (float|Tensor, optional): the tolerance value. Default: None.
If tol is not specified, let sigma be the largest singular value (or largest eigenvalue in absolute value) and eps be the
epsilon value for the dtype of x; then tol is computed as tol = sigma * max(m, n) * eps. Note that if x is
a batch of matrices, tol is computed this way for every matrix in the batch.
hermitian (bool, optional): indicates whether x is Hermitian. Default: False.
When hermitian=True, x is assumed to be Hermitian without being checked inside the function; only the
lower triangular part of the matrix is used in the computation.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: Rank of tensor x.
Examples:
.. code-block:: python
import paddle
a = paddle.eye(10)
b = paddle.linalg.matrix_rank(a)
print(b)
# b = [10]
c = paddle.ones(shape=[3, 4, 5, 5])
d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True)
print(d)
# d = [[1, 1, 1, 1],
# [1, 1, 1, 1],
# [1, 1, 1, 1]]
"""
if in_dygraph_mode():
if tol is None:
tol_tensor = None
tol_attr = 0.0
use_default_tol = True
elif isinstance(tol, Variable):
if tol.dtype != x.dtype:
tol_tensor = cast(tol, x.dtype)
else:
tol_tensor = tol
tol_attr = 0.0
use_default_tol = False
else:
tol_tensor = None
tol_attr = float(tol)
use_default_tol = False
return _C_ops.matrix_rank(x, tol_tensor, "tol", tol_attr, 'hermitian',
hermitian, 'use_default_tol', use_default_tol)
inputs = {}
attrs = {}
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'matrix_rank')
inputs['X'] = x
if tol is None:
attrs['use_default_tol'] = True
elif isinstance(tol, Variable):
check_variable_and_dtype(tol, 'tol', ['float32'], 'matrix_rank')
attrs['use_default_tol'] = False
if tol.dtype != x.dtype:
inputs['TolTensor'] = cast(tol, x.dtype)
else:
inputs['TolTensor'] = tol
else:
check_type(tol, 'tol', float, 'matrix_rank')
attrs['use_default_tol'] = False
attrs['tol'] = tol
check_type(hermitian, 'hermitian', bool, 'matrix_rank')
attrs['hermitian'] = hermitian
helper = LayerHelper('matrix_rank', **locals())
out = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type='matrix_rank', inputs=inputs, outputs={'Out': out}, attrs=attrs)
return out
def bmm(x, y, name=None):
"""
Applies batched matrix multiplication to two tensors.
......
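To round out the Python API change above, here is a short sketch of passing tol as a Tensor that broadcasts against the batch dimensions, mirroring the dygraph unit test (the inputs are random, so the resulting ranks will vary):

import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(3, 4, 7, 8).astype(np.float64))
# One tolerance per matrix in the [3, 4] batch; matrix_rank casts TolTensor
# to x's dtype internally when the dtypes differ.
tol = paddle.to_tensor(np.random.random([3, 4]).astype(np.float32))
rank = paddle.linalg.matrix_rank(x, tol=tol, hermitian=False)
print(rank.shape)  # [3, 4]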