diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 8b2e3fc323ce560fbd5f8248584f95ef6a4bb702..b08286533130b6124193f9042869b4c26725eaaa 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -1,16 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include <algorithm>
 #include <limits>
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 596eb99e81320cc6d838ee9fed0ff517f8ffeceb..7d368966eba645d0fb8258cbe5468bb33c63d6ec 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -85,7 +85,7 @@ endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e73e74ab856abb8e32e996e3d69daf7b0f39bfe7
--- /dev/null
+++ b/paddle/fluid/operators/inverse_op.cc
@@ -0,0 +1,129 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/inverse_op.h"
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace operators {
+
+class InverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Inverse");
+    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "Inverse");
+
+    auto input_dims = ctx->GetInputDim("Input");
+    int64_t input_rank = input_dims.size();
+    PADDLE_ENFORCE_GE(
+        input_rank, 2,
+        platform::errors::InvalidArgument(
+            "The dimension of Input(Input) is expected to be no less than 2. "
+            "But recieved: Input(Input)'s dimension = %d, shape = [%s].",
+            input_rank, input_dims));
+    if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) {
+      PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1],
+                        platform::errors::InvalidArgument(
+                            "The last two dimensions are expected to be equal. "
+                            "But recieved: %d and %d; "
+                            "Input(Input)'s shape = [%s].",
+                            input_dims[input_rank - 2],
+                            input_dims[input_rank - 1], input_dims));
+    }
+
+    ctx->SetOutputDim("Output", input_dims);
+    ctx->ShareLoD("Input", /*->*/ "Output");
+  }
+};
+
+class InverseOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{
+        {"Input", /*->*/ "Output"}};
+    return m;
+  }
+};
+
+class InverseGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto input_grad = framework::GradVarName("Input");
+    auto output_grad = framework::GradVarName("Output");
+
+    OP_INOUT_CHECK(ctx->HasInput("Output"), "Input", "Output", "InverseGrad");
+    OP_INOUT_CHECK(ctx->HasInput(output_grad), "Input", output_grad,
+                   "InverseGrad");
+
+    if (ctx->HasOutput(input_grad)) {
+      ctx->SetOutputDim(input_grad, ctx->GetInputDim(output_grad));
+    }
+  }
+};
+
+class InverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "(Tensor) A square matrix (2-D Tensor) or batches of square matrices"
+        " to inverse.");
+    AddOutput("Output", "(Tensor) The inverse of input matrix.");
+    AddComment(R"DOC(
+Inverse Operator
+
+Takes the inverse of the square matrix.
+)DOC");
+  }
+};
+
+template <typename T>
+class InverseGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> grad) const override {
+    grad->SetType(this->ForwardOpType() + "_grad");
+    grad->SetInput("Output", this->Output("Output"));
+    grad->SetInput(framework::GradVarName("Output"),
+                   this->OutputGrad("Output"));
+    grad->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(inverse, ops::InverseOp, ops::InverseOpMaker,
+                  ops::InverseOpInferVarType,
+                  ops::InverseGradOpMaker<paddle::framework::OpDesc>,
+                  ops::InverseGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(inverse_grad, ops::InverseGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    inverse, ops::InverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::InverseKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    inverse_grad,
+    ops::InverseGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::InverseGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/inverse_op.cu.cc b/paddle/fluid/operators/inverse_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ca61159f3a306f55ec73d66a00185e1932dac36
--- /dev/null
+++ b/paddle/fluid/operators/inverse_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/inverse_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    inverse, ops::InverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::InverseKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    inverse_grad,
+    ops::InverseGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::InverseGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1859a26f360bfc457f6417071005d4b5bfab949
--- /dev/null
+++ b/paddle/fluid/operators/inverse_op.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/matrix_inverse.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class InverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<framework::Tensor>("Input");
+    auto* output = context.Output<framework::Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    mat_inv(dev_ctx, *input, output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class InverseGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* a_inv = context.Input<framework::Tensor>("Output");
+    auto* a_inv_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Output"));
+    auto* a_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+
+    if (a_grad) {
+      a_grad->mutable_data<T>(context.GetPlace());
+
+      auto blas = math::GetBlas<DeviceContext, T>(context);
+      auto& dev_ctx = context.template device_context<DeviceContext>();
+      framework::Tensor tmp_out =
+          context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx);
+
+      auto mat_dim_a0 =
+          math::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false);
+      auto mat_dim_b0 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true);
+      blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out,
+                  T(0));
+
+      auto mat_dim_a1 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true);
+      auto mat_dim_b1 = math::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
+      blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 77a8aa5d9c7a1691d9c667c26e0bbbba688bb4ef..3a19c7edff3569d503480fd060a6432dc59d2108 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -21,6 +21,9 @@ function(math_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
         list(APPEND cu_srcs ${TARGET}.cu)
     endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+        list(APPEND cu_srcs ${TARGET}.cu.cc)
+    endif()
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
         list(APPEND hip_srcs ${TARGET}.hip.cu)
     endif()
@@ -68,6 +71,7 @@ math_library(vol2col)
 math_library(prelu)
 math_library(bert_encoder_functor)
 math_library(tree2col DEPS math_function)
+math_library(matrix_inverse)
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 5a96e6bb4a196a9dafca4eb6a816ac3237850f49..f8c971954fc4c0b367cc6e62df8f7a596b651b94 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -223,6 +223,19 @@ class Blas {
             CBLAS_DIAG diag, int M, int N, T alpha, const T* A, int lda, T* B,
             int ldb) const;
 
+#ifdef PADDLE_WITH_CUDA
+  template <typename T>
+  void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const;
+
+  template <typename T>
+  void BatchedGETRI(int n, const T** a, const int* ipiv, T** a_inv, int* info,
+                    int batch_size) const;
+
+  template <typename T>
+  void BatchedMatInv(int n, const T** a, T** a_inv, int* info,
+                     int batch_size) const;
+#endif
+
  private:
   const DeviceContext& context_;
 };
@@ -361,6 +374,23 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template TRSM<T>(args...);
   }
 
+#ifdef PADDLE_WITH_CUDA
+  template <typename... ARGS>
+  void BatchedGETRF(ARGS... args) const {
+    Base()->template BatchedGETRF<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void BatchedGETRI(ARGS... args) const {
+    Base()->template BatchedGETRI<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void BatchedMatInv(ARGS... args) const {
+    Base()->template BatchedMatInv<T>(args...);
+  }
+#endif
+
  private:
   const Blas<DeviceContext>* Base() const {
     return static_cast<const Blas<DeviceContext>*>(this);
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index e7720a9769936125b5effad4fa803e9120a0c76d..39bddda6caa532df0c6d392a9ca2e76766d38f3e 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -93,6 +93,24 @@ struct CUBlas<float> {
   static void TRSM(ARGS... args) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...));
   }
+
+  template <typename... ARGS>
+  static void GETRF_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasSgetrfBatched(args...));
+  }
+
+  template <typename... ARGS>
+  static void GETRI_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasSgetriBatched(args...));
+  }
+
+  template <typename... ARGS>
+  static void MATINV_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasSmatinvBatched(args...));
+  }
 };
 
 template <>
@@ -141,6 +159,24 @@ struct CUBlas<double> {
   static void TRSM(ARGS... args) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...));
   }
+
+  template <typename... ARGS>
+  static void GETRF_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasDgetrfBatched(args...));
+  }
+
+  template <typename... ARGS>
+  static void GETRI_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasDgetriBatched(args...));
+  }
+
+  template <typename... ARGS>
+  static void MATINV_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cublasDmatinvBatched(args...));
+  }
 };
 
 template <>
@@ -446,6 +482,44 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
   });
 }
 
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
+                                                     int *info,
+                                                     int batch_size) const {
+  context_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
+  });
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
+                                                     const int *ipiv, T **a_inv,
+                                                     int *info,
+                                                     int batch_size) const {
+  PADDLE_ENFORCE_NE(
+      a_inv, a,
+      platform::errors::InvalidArgument(
+          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
+          "in-place. The memory space of output matrix (address: %p) cannot "
+          "overlap memory space of input matrix (address: %p).",
+          a_inv, a));
+  context_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
+  });
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
+                                                      T **a_inv, int *info,
+                                                      int batch_size) const {
+  context_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
+  });
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25bc5d725e1fd0b3d896ec44a603ce2834b70b44
--- /dev/null
+++ b/paddle/fluid/operators/math/matrix_inverse.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/matrix_inverse.h"
+#include "Eigen/Core"
+#include "Eigen/LU"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MatrixInverseFunctor<platform::CPUDeviceContext, T> {
+  using Matrix =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using EigenMatrixMap = Eigen::Map<Matrix>;
+  using ConstEigenMatrixMap = Eigen::Map<const Matrix>;
+
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& a, framework::Tensor* a_inv) {
+    const auto& mat_dims = a.dims();
+    const int rank = mat_dims.size();
+    int n = mat_dims[rank - 1];
+    int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
+
+    const T* a_ptr = a.data<T>();
+    T* a_inv_ptr = a_inv->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n);
+      EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n);
+      Eigen::PartialPivLU<Matrix> lu;
+      lu.compute(mat);
+
+      const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
+      PADDLE_ENFORCE_GT(
+          min_abs_pivot, static_cast<T>(0),
+          platform::errors::InvalidArgument("Input is not invertible."));
+      mat_inv.noalias() = lu.inverse();
+    }
+  }
+};
+
+template class MatrixInverseFunctor<platform::CPUDeviceContext, float>;
+template class MatrixInverseFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea4e582ad10c3220b7a27986ec88005e5198b5c
--- /dev/null
+++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/matrix_inverse.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& a, framework::Tensor* a_inv) {
+    const auto& mat_dims = a.dims();
+    const int rank = mat_dims.size();
+    int n = mat_dims[rank - 1];
+    int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
+
+    memory::allocation::AllocationPtr tmp_gpu_mat_data;
+    const T* gpu_mat = a.data<T>();
+    if (n >= 32) {
+      // Copy all elements of input matrix A to a temporary memory space to
+      // avoid being overriden by getrf.
+      tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T));
+      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   tmp_gpu_mat_data->ptr(),
+                   boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   a.data<void>(), a.numel() * sizeof(T), context.stream());
+      gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
+    }
+
+    std::vector<const T*> cpu_ptrs(batch_size * 2);
+    for (int i = 0; i < batch_size; ++i) {
+      cpu_ptrs[i] = gpu_mat + i * n * n;
+      cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
+    }
+
+    // Copy the addresses of A and A_inv from host to device.
+    memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+        memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
+    memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                 tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(),
+                 static_cast<void*>(cpu_ptrs.data()),
+                 cpu_ptrs.size() * sizeof(T*), context.stream());
+    T** gpu_inv_ptrs =
+        reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
+
+    // Allocate device memory for info and pivots.
+    int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+    memory::allocation::AllocationPtr tmp_gpu_info_data =
+        memory::Alloc(context, num_ints * sizeof(int));
+    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+
+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+
+    // This functions in cuBLAS is intended to be used for matrices of small
+    // sizes where the launch overhead is a significant factor.
+    // TODO(Xreki): call function in cusolver for large matrices.
+    if (n < 32) {
+      // cublas<S/D>matinvBatched is a short cut of cublas<S/D>getrfBatched
+      // plus cublas<S/D>getriBatched.
+      // However it only works if N is less than 32. If not, we need to
+      // go through cublas<S/D>getrfBatched and cublas<S/D>getriBatched.
+      blas.BatchedMatInv(n,
+                         reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
+                         gpu_inv_ptrs, gpu_info_ptr, batch_size);
+    } else {
+      // This function performs the LU factorization of each matrix A by the
+      // equation P * A = L * U. L and U are written back to original matrix A,
+      // and diagonal elements of L are discarded.
+      int* gpu_pivot_ptr =
+          reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+      blas.BatchedGETRF(n, reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
+                        gpu_pivot_ptr, gpu_info_ptr, batch_size);
+
+      blas.BatchedGETRI(n,
+                        reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
+                        gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size);
+    }
+  }
+};
+
+template class MatrixInverseFunctor<platform::CUDADeviceContext, float>;
+template class MatrixInverseFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_inverse.h b/paddle/fluid/operators/math/matrix_inverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0baf0b250e757315da6e45cdb9d92c1336e9c26
--- /dev/null
+++ b/paddle/fluid/operators/math/matrix_inverse.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename DeviceContext, typename T>
+class MatrixInverseFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& a,
+                  framework::Tensor* a_inv);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 141de2881d3d27d1878f44432f68dd0a41d0f3be..937e200924b71dc8b8c726741e35964ebc65236e 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -90,7 +90,9 @@ extern void *cublas_dso_handle;
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
-  __macro(cublasDgetriBatched);
+  __macro(cublasDgetriBatched);           \
+  __macro(cublasSmatinvBatched);          \
+  __macro(cublasDmatinvBatched);
 
 CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 914d04e048615527972c889cdb2967a2ae476473..2be95b113b2f243b9813353877ddc61421b1a56c 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -1,8 +1,11 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 65984c600d3bd9bba89fc60ed93f87ca6e38219f..897967b0c147b48485d80d497f985c215209d391 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -167,7 +167,7 @@ from .tensor.math import div  #DEFINE_ALIAS
 from .tensor.math import add  #DEFINE_ALIAS
 from .tensor.math import atan  #DEFINE_ALIAS
 from .tensor.math import logsumexp  #DEFINE_ALIAS
-# from .tensor.math import inverse        #DEFINE_ALIAS
+from .tensor.math import inverse  #DEFINE_ALIAS
 from .tensor.math import log1p  #DEFINE_ALIAS
 from .tensor.math import erf  #DEFINE_ALIAS
 from .tensor.math import addcmul  #DEFINE_ALIAS
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f13a4ceee69c124bfe4d56e67ac222468b349199..1ef63a28bc6cd321366c49dbeb39cdd3e9f46349 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1221,9 +1221,9 @@ class OpTest(unittest.TestCase):
 
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
-                return ("%s error, %s variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, expected %f, but got %f.") \
-                    % (self.op_type, msg_prefix, name, max_diff, max_relative_error,
+                return ("Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, "
+                    "the first error element is %d, expected %e, but got %e.") \
+                    % (self.op_type, msg_prefix, name, str(a.shape), self.dtype, max_diff, max_relative_error,
                     offset, a.flatten()[offset], b.flatten()[offset])
 
             self.assertLessEqual(max_diff, max_relative_error, err_msg())
diff --git a/python/paddle/fluid/tests/unittests/test_inverse_op.py b/python/paddle/fluid/tests/unittests/test_inverse_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..13cb2b1f8b1161cd35cc75dd4589de62196c4525
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_inverse_op.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle
+from op_test import OpTest
+
+
+class TestInverseOp(OpTest):
+    def config(self):
+        self.matrix_shape = [10, 10]
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.op_type = "inverse"
+        self.config()
+
+        np.random.seed(123)
+        mat = np.random.random(self.matrix_shape).astype(self.dtype)
+        inverse = np.linalg.inv(mat)
+
+        self.inputs = {'Input': mat}
+        self.outputs = {'Output': inverse}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_grad(self):
+        self.check_grad(['Input'], 'Output')
+
+
+class TestInverseOpBatched(TestInverseOp):
+    def config(self):
+        self.matrix_shape = [8, 4, 4]
+        self.dtype = "float64"
+
+
+class TestInverseOpLarge(TestInverseOp):
+    def config(self):
+        self.matrix_shape = [32, 32]
+        self.dtype = "float64"
+
+    def test_grad(self):
+        self.check_grad(['Input'], 'Output', max_relative_error=1e-6)
+
+
+class TestInverseOpFP32(TestInverseOp):
+    def config(self):
+        self.matrix_shape = [10, 10]
+        self.dtype = "float32"
+
+    def test_grad(self):
+        self.check_grad(['Input'], 'Output', max_relative_error=1e-2)
+
+
+class TestInverseOpBatchedFP32(TestInverseOpFP32):
+    def config(self):
+        self.matrix_shape = [8, 4, 4]
+        self.dtype = "float32"
+
+
+class TestInverseOpLargeFP32(TestInverseOpFP32):
+    def config(self):
+        self.matrix_shape = [32, 32]
+        self.dtype = "float32"
+
+
+class TestInverseAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place, with_out=False):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[4, 4], dtype="float64")
+            if with_out:
+                out = fluid.data(name="output", shape=[4, 4], dtype="float64")
+            else:
+                out = None
+            result = paddle.inverse(input=input, out=out)
+
+            input_np = np.random.random([4, 4]).astype("float64")
+            result_np = np.linalg.inv(input_np)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], np.linalg.inv(input_np)))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([4, 4]).astype("float64")
+                input = fluid.dygraph.to_variable(input_np)
+                result = paddle.inverse(input)
+                self.assertTrue(
+                    np.allclose(result.numpy(), np.linalg.inv(input_np)))
+
+
+class TestInverseAPIError(unittest.TestCase):
+    def test_errors(self):
+        input_np = np.random.random([4, 4]).astype("float64")
+
+        # input must be Variable.
+        self.assertRaises(TypeError, paddle.inverse, input_np)
+
+        # The data type of input must be float32 or float64.
+        for dtype in ["bool", "int32", "int64", "float16"]:
+            input = fluid.data(name='input_' + dtype, shape=[4, 4], dtype=dtype)
+            self.assertRaises(TypeError, paddle.inverse, input)
+
+        # When out is set, the data type must be the same as input.
+        input = fluid.data(name='input_1', shape=[4, 4], dtype="float32")
+        out = fluid.data(name='output', shape=[4, 4], dtype="float64")
+        self.assertRaises(TypeError, paddle.inverse, input, out)
+
+        # The number of dimensions of input must be >= 2.
+        input = fluid.data(name='input_2', shape=[4], dtype="float32")
+        self.assertRaises(ValueError, paddle.inverse, input)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
index b83f21e26fd3c28b412284785d465b0fb25b3513..fd3d5f3104f8243fdcae312620742688eb79b854 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
@@ -39,7 +39,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'spp', \
     'teacher_student_sigmoid_loss', \
     'unpool', \
-    'yolov3_loss'
+    'yolov3_loss', \
+    'inverse'
 ]
 
 NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 2232f252bcf4f00165ad8eac1cc4e307bb1fe18d..a3bbc4879e4aa63b44dd803928a0c95ed629eebd 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -145,7 +145,7 @@ from .math import div  #DEFINE_ALIAS
 from .math import add  #DEFINE_ALIAS
 from .math import atan  #DEFINE_ALIAS
 from .math import logsumexp  #DEFINE_ALIAS
-# from .math import inverse        #DEFINE_ALIAS
+from .math import inverse  #DEFINE_ALIAS
 from .math import log1p  #DEFINE_ALIAS
 from .math import erf  #DEFINE_ALIAS
 from .math import addcmul  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 4c10063237df61a5ea3845140c6295de305d4473..b099c7f865398c267d78a3bb0fda45a7e4afd0dc 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -105,7 +105,7 @@ __all__ = [
         'add',
         'atan',
         'logsumexp',
-#       'inverse',
+        'inverse',
         'log1p',
         'erf',
         'addcmul',
@@ -986,6 +986,7 @@ def mm(input, mat2, out=None, name=None):
                                'Y': mat2}, outputs={'Out': out})
     return out
 
+
 def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
     """
     **addmm**
@@ -1120,6 +1121,78 @@ def logsumexp(x, dim=None, keepdim=False, out=None, name=None):
     return layers.log(sum_out, name)
 
 
+def inverse(input, out=None, name=None):
+    """
+    Takes the inverse of the square matrix. A square matrix is a matrix with
+    the same number of rows and columns. The input can be a square matrix
+    (2-D Tensor) or batches of square matrices.
+
+    Args:
+        input (Variable): The input Variable which holds a Tensor. The last two
+            dimensions should be equal. When the number of dimensions is
+            greater than 2, it is treated as batches of square matrix. The data
+            type can be float32 and float64.
+        out (Variable, optional): Optional output which can be any created 
+            Variable that meets the requirements to store the result of operation.
+            If out is None, a new Varibale will be create to store the result.
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information,
+            please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Variable: A Tensor holds the inverse of input. The shape and data type
+            is the same as input.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+
+            mat_np = np.array([[2, 0], [0, 2]]).astype("float32")
+
+            # example for static graph
+            input = fluid.data("input", shape=[2, 2], dtype="float32")
+            out = paddle.inverse(input)
+        
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            results = exe.run(feed={"input": mat_np },
+                              fetch_list=[out.name])
+            print(results[0]) # [[0.5, 0], [0, 0.5]]
+
+            # example for dynamic graph
+            with fluid.dygraph.guard():
+                mat = fluid.dygraph.to_variable(mat_np)
+                inv = paddle.inverse(mat)
+                print(inv) # [[0.5, 0], [0, 0.5]]
+    """
+    if in_dygraph_mode():
+        return core.ops.inverse(input)
+
+    def _check_input(input):
+        check_variable_and_dtype(input, 'input',
+                                 ['float32', 'float64'], 'inverse')
+        if len(input.shape) < 2:
+            raise ValueError(
+                "The input of inverse is expected to be a Tensor whose number "
+                "of dimensions is no less than 2. But reviced: %d, "
+                "input's shape: %s." % (len(input.shape), input.shape))
+
+        if out is not None:
+            check_variable_and_dtype(out, 'out', input.dtype, 'inverse')
+
+    _check_input(input)
+
+    helper = LayerHelper('inverse', **locals())
+    if out is None:
+        out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='inverse', inputs={'Input': [input] }, outputs={'Output': [out]})
+    return out
+
+
 def max(input, dim=None, keep_dim=False, out=None, name=None):
     """
     Computes the maximum of tensor elements over the given dimension.