[Cherry-pick] Add the forward QR operator (#36627)

616ce203 · Yulong Ao · GitHub · fced11bd · 616ce203 · 616ce203
10 changed file
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -185,6 +185,7 @@ function(op_library TARGET)
        list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
        list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
        list(REMOVE_ITEM hip_srcs "svd_op.cu")
+        list(REMOVE_ITEM hip_srcs "qr_op.cu")
        list(REMOVE_ITEM hip_srcs "eigh_op.cu")
        list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
        list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")

--- a/paddle/fluid/operators/qr_op.cc
+++ b/paddle/fluid/operators/qr_op.cc
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/qr_op.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+using DDim = framework::DDim;
+
+class QrOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr");
+    OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr");
+    OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr");
+
+    auto x_dims = ctx->GetInputDim("X");
+    int x_rank = x_dims.size();
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "the rank of input must greater than 2"));
+    bool compute_q;
+    bool reduced_mode;
+    int m = x_dims[x_rank - 2];
+    int n = x_dims[x_rank - 1];
+    int min_mn = std::min(m, n);
+    std::string mode = ctx->Attrs().Get<std::string>("mode");
+    std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
+
+    if (compute_q) {
+      int k = reduced_mode ? min_mn : m;
+      auto q_dims_vec = framework::vectorize(x_dims);
+      q_dims_vec[q_dims_vec.size() - 1] = k;
+      ctx->SetOutputDim("Q", framework::make_ddim(q_dims_vec));
+    } else {
+      ctx->SetOutputDim("Q", framework::make_ddim({0}));
+    }
+
+    int k = reduced_mode ? min_mn : m;
+    auto r_dims_vec = framework::vectorize(x_dims);
+    r_dims_vec[r_dims_vec.size() - 2] = k;
+    r_dims_vec[r_dims_vec.size() - 1] = n;
+    ctx->SetOutputDim("R", framework::make_ddim(r_dims_vec));
+
+    ctx->ShareLoD("X", /*->*/ "Q");
+    ctx->ShareLoD("X", /*->*/ "R");
+  }
+};
+
+class QrOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor), The input tensor of qr op.");
+    AddOutput("Q", "(Tensor), The output Q tensor of qr op.");
+    AddOutput("R", "(Tensor), The output R tensor of qr op.");
+    AddAttr<std::string>(
+        "mode",
+        "(string, default \"reduced\"). "
+        "If mode is \"reduced\", Qr op will return reduced Q and R matrices. "
+        "If mode is \"complete\", Qr op will return complete Q and R matrices. "
+        "If mode is \"r\", Qr op will only return reduced R matrix.")
+        .SetDefault("reduced");
+    AddComment(R"DOC(
+Qr Operator.
+
+This operator is used to perform QR operation for batched matrics $X$.
+$$Q, R = qr(X)$$
+
+)DOC");
+  }
+};
+
+class QrGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")), "Input",
+                   "Q@Grad", "QrGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")), "Input",
+                   "R@Grad", "QrGrad");
+    OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad");
+    OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad");
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                   "X@Grad", "QrGrad");
+
+    auto x_dims = ctx->GetInputDim(("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(dtype, ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class QrGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> retv) const override {
+    retv->SetType("qr_grad");
+    retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q"));
+    retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R"));
+    retv->SetInput("Q", this->Output("Q"));
+    retv->SetInput("R", this->Output("R"));
+    retv->SetInput("X", this->Input("X"));
+    retv->SetAttrMap(this->Attrs());
+    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
+                  ops::QrGradMaker<paddle::framework::OpDesc>,
+                  ops::QrGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
+
+REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
+
+REGISTER_OP_CPU_KERNEL(
+    qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/qr_op.cu
+++ b/paddle/fluid/operators/qr_op.cu
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
+
+#include <thrust/device_vector.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/qr_op.h"
+#include "paddle/fluid/platform/dynload/cusolver.h"
+
+// Reuse some helper functions from svd
+#include "paddle/fluid/operators/svd_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class QrGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool compute_q;
+    bool reduced_mode;
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    const Tensor& x = *context.Input<Tensor>("X");
+    Tensor& q = *context.Output<Tensor>("Q");
+    Tensor& r = *context.Output<Tensor>("R");
+    const std::string mode = context.Attr<std::string>("mode");
+    std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
+
+    auto numel = x.numel();
+    PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
+                                    "The input of QR is empty."));
+    auto x_dims = x.dims();
+    int x_rank = x_dims.size();
+    int m = x_dims[x_rank - 2];
+    int n = x_dims[x_rank - 1];
+    int min_mn = std::min(m, n);
+    int k = reduced_mode ? min_mn : m;
+    int batch_size = numel / (m * n);
+    int qr_stride = m * n;
+    int tau_stride = min_mn;
+
+    if (compute_q) {
+      q.mutable_data<math::Real<T>>(
+          context.GetPlace(),
+          size_t(batch_size * m * k * sizeof(math::Real<T>)));
+    }
+    r.mutable_data<math::Real<T>>(
+        context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real<T>)));
+
+    auto dito =
+        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
+                                                 T>(context);
+
+    // Note: allocate temporary tensors because of lacking in-place operatios.
+    // Prepare qr
+    Tensor qr;
+    qr.mutable_data<math::Real<T>>(
+        context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real<T>)));
+    // BatchedGeqrf performs computation in-place and 'qr' must be a copy of
+    // input
+    TensorCopy(x, context.GetPlace(), &qr);
+
+    // Prepare tau
+    auto tau_dims_vec = framework::vectorize<int>(x_dims);
+    tau_dims_vec.pop_back();
+    tau_dims_vec[tau_dims_vec.size() - 1] = min_mn;
+    Tensor tau = dito.Fill(tau_dims_vec, 0);
+
+    // Transpose 'qr' to conform the column-major order
+    auto tmp_qr = dito.Transpose(qr);
+    framework::TensorCopy(tmp_qr, qr.place(), &qr);
+    auto qr_data = qr.mutable_data<T>(context.GetPlace());
+    auto tau_data = tau.mutable_data<T>(context.GetPlace());
+
+    BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride,
+                 tau_stride);
+
+    if (reduced_mode) {
+      auto trans_qr = dito.Transpose(qr);
+      auto sliced_qr = dito.Slice(trans_qr, {-2}, {0}, {min_mn});
+      auto tmp_r = dito.TrilTriu(sliced_qr, 0, false);
+      // Transpose 'tmp_r' to retore the original row-major order
+      framework::TensorCopy(tmp_r, r.place(), &r);
+    } else {
+      auto trans_qr = dito.Transpose(qr);
+      auto tmp_r = dito.TrilTriu(trans_qr, 0, false);
+      // Transpose 'tmp_r' to retore the original row-major order
+      framework::TensorCopy(tmp_r, r.place(), &r);
+    }
+
+    if (compute_q) {
+      // Perform QRGQR for Q using the result from GEQRF
+      // Transpose 'q' to retore the original row-major order
+      if (reduced_mode) {
+        BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m,
+                     tau_data, qr_stride, tau_stride);
+        auto trans_q = dito.Transpose(qr);
+        auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn});
+        framework::TensorCopy(sliced_q, q.place(), &q);
+      } else {
+        if (m > n) {
+          auto new_qr_dims_vec = framework::vectorize<int>(x_dims);
+          new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m;
+          Tensor new_qr = dito.Fill(new_qr_dims_vec, 0);
+          auto new_qr_data = new_qr.mutable_data<T>(context.GetPlace());
+          auto new_qr_stride = m * m;
+          for (int i = 0; i < batch_size; ++i) {
+            memory::Copy(
+                BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                (new_qr_data + i * new_qr_stride),
+                BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                (qr_data + i * qr_stride), qr_stride * sizeof(math::Real<T>),
+                dev_ctx.stream());
+          }
+          BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m,
+                       tau_data, new_qr_stride, tau_stride);
+          auto trans_q = dito.Transpose(new_qr);
+          framework::TensorCopy(trans_q, q.place(), &q);
+        } else {
+          BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data,
+                       qr_stride, tau_stride);
+          auto trans_q = dito.Transpose(qr);
+          auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m});
+          framework::TensorCopy(sliced_q, q.place(), &q);
+        }
+      }
+    }
+  }
+
+  void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size,
+                    int m, int n, float* a, int lda, float* tau, int a_stride,
+                    int tau_stride) const;
+
+  void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size,
+                    int m, int n, double* a, int lda, double* tau, int a_stride,
+                    int tau_stride) const;
+
+  void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size,
+                    int m, int n, int k, float* a, int lda, float* tau,
+                    int a_stride, int tau_stride) const;
+
+  void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size,
+                    int m, int n, int k, double* a, int lda, double* tau,
+                    int a_stride, int tau_stride) const;
+};
+
+template <>
+void QrGPUKernel<float>::BatchedGeqrf(
+    const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
+    float* a, int lda, float* tau, int a_stride, int tau_stride) const {
+  int lwork = 0;
+
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize(
+      handle, m, n, a, lda, &lwork));
+  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
+  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
+  auto info = memory::Alloc(dev_ctx, sizeof(int));
+  int* info_d = reinterpret_cast<int*>(info->ptr());
+
+  for (int i = 0; i < batch_size; ++i) {
+    float* a_working_ptr = &a[i * a_stride];
+    float* tau_working_ptr = &tau[i * tau_stride];
+    // compute geqrf
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf(
+        handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork,
+        info_d));
+    // Do we need synchronized here?
+    // check the error info
+    int info_h;
+    memory::Copy(platform::CPUPlace(), &info_h,
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_d, sizeof(int), dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        info_h, 0,
+        platform::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h));
+  }
+}
+
+template <>
+void QrGPUKernel<double>::BatchedGeqrf(
+    const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
+    double* a, int lda, double* tau, int a_stride, int tau_stride) const {
+  int lwork = 0;
+
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize(
+      handle, m, n, a, lda, &lwork));
+  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
+  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
+  auto info = memory::Alloc(dev_ctx, sizeof(int));
+  int* info_d = reinterpret_cast<int*>(info->ptr());
+
+  for (int i = 0; i < batch_size; ++i) {
+    double* a_working_ptr = &a[i * a_stride];
+    double* tau_working_ptr = &tau[i * tau_stride];
+    // compute geqrf
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf(
+        handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork,
+        info_d));
+    // Do we need synchronized here?
+    // check the error info
+    int info_h;
+    memory::Copy(platform::CPUPlace(), &info_h,
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_d, sizeof(int), dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        info_h, 0,
+        platform::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h));
+  }
+}
+
+template <>
+void QrGPUKernel<float>::BatchedOrgqr(
+    const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
+    int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const {
+  int lwork = 0;
+
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize(
+      handle, m, n, k, a, lda, tau, &lwork));
+  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
+  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
+  auto info = memory::Alloc(dev_ctx, sizeof(int));
+  int* info_d = reinterpret_cast<int*>(info->ptr());
+
+  for (int i = 0; i < batch_size; ++i) {
+    float* a_working_ptr = &a[i * a_stride];
+    float* tau_working_ptr = &tau[i * tau_stride];
+    // compute orggr
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr(
+        handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr,
+        lwork, info_d));
+    // Do we need synchronized here?
+    // check the error info
+    int info_h;
+    memory::Copy(platform::CPUPlace(), &info_h,
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_d, sizeof(int), dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        info_h, 0,
+        platform::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h));
+  }
+}
+
+template <>
+void QrGPUKernel<double>::BatchedOrgqr(
+    const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
+    int k, double* a, int lda, double* tau, int a_stride,
+    int tau_stride) const {
+  int lwork = 0;
+
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize(
+      handle, m, n, k, a, lda, tau, &lwork));
+  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
+  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
+  auto info = memory::Alloc(dev_ctx, sizeof(int));
+  int* info_d = reinterpret_cast<int*>(info->ptr());
+
+  for (int i = 0; i < batch_size; ++i) {
+    double* a_working_ptr = &a[i * a_stride];
+    double* tau_working_ptr = &tau[i * tau_stride];
+    // compute orggr
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr(
+        handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr,
+        lwork, info_d));
+    // Do we need synchronized here?
+    // check the error info
+    int info_h;
+    memory::Copy(platform::CPUPlace(), &info_h,
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_d, sizeof(int), dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        info_h, 0,
+        platform::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h));
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel<float>, ops::QrGPUKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    qr_grad, ops::QrGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::QrGradKernel<paddle::platform::CUDADeviceContext, double>);
+
+#endif  // not PADDLE_WITH_HIP
--- a/paddle/fluid/operators/qr_op.h
+++ b/paddle/fluid/operators/qr_op.h
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Eigen/Dense>
+#include <cstdarg>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/complex_functors.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
+  bool compute_q;
+  bool reduced;
+  if (mode == "reduced") {
+    compute_q = true;
+    reduced = true;
+  } else if (mode == "complete") {
+    compute_q = true;
+    reduced = false;
+  } else if (mode == "r") {
+    compute_q = false;
+    reduced = true;
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "QR received unrecognized mode '%s'"
+        " but expected one of 'reduced' (default), 'r', or 'complete'",
+        mode));
+  }
+  return std::make_tuple(compute_q, reduced);
+}
+
+template <typename T>
+class QrCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool compute_q;
+    bool reduced_mode;
+    const Tensor& x = *context.Input<Tensor>("X");
+    Tensor& q = *context.Output<Tensor>("Q");
+    Tensor& r = *context.Output<Tensor>("R");
+    std::string mode = context.Attr<std::string>("mode");
+    std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
+
+    auto numel = x.numel();
+    PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
+                                    "The input of QR is empty."));
+    auto x_dims = x.dims();
+    int x_rank = x_dims.size();
+    int m = x_dims[x_rank - 2];
+    int n = x_dims[x_rank - 1];
+    int min_mn = std::min(m, n);
+    int k = reduced_mode ? min_mn : m;
+    int batch_size = numel / (m * n);
+    int x_stride = m * n;
+    int q_stride = m * k;
+    int r_stride = k * n;
+
+    auto* x_data = x.data<math::Real<T>>();
+    T* q_data = nullptr;
+    if (compute_q) {
+      q_data = q.mutable_data<math::Real<T>>(
+          context.GetPlace(),
+          size_t(batch_size * m * k * sizeof(math::Real<T>)));
+    }
+    auto* r_data = r.mutable_data<math::Real<T>>(
+        context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real<T>)));
+
+    // Implement QR by calling Eigen
+    for (int i = 0; i < batch_size; ++i) {
+      const T* x_matrix_ptr = x_data + i * x_stride;
+      T* r_matrix_ptr = r_data + i * r_stride;
+      using EigenDynamicMatrix =
+          Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+      auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
+      Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
+      if (reduced_mode) {
+        auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
+        auto r_matrix_view =
+            qr_top_matrix.template triangularView<Eigen::Upper>();
+        auto r_matrix = EigenDynamicMatrix(r_matrix_view);
+        memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
+      } else {
+        auto r_matrix_view =
+            qr.matrixQR().template triangularView<Eigen::Upper>();
+        auto r_matrix = EigenDynamicMatrix(r_matrix_view);
+        memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
+      }
+
+      if (compute_q) {
+        T* q_matrix_ptr = q_data + i * q_stride;
+        if (reduced_mode) {
+          auto q_matrix =
+              qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
+          q_matrix.transposeInPlace();
+          memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
+        } else {
+          auto q_matrix =
+              qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
+          q_matrix.transposeInPlace();
+          memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
+        }
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class QrGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "QR doesn't have the backward kernel now and will be supported soon."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/svd_helper.h
+++ b/paddle/fluid/operators/svd_helper.h
@@ -502,6 +502,19 @@ struct DeviceIndependenceTensorOperations {
    return ret;
  }

+  framework::Tensor TrilTriu(const framework::Tensor& x, int diagonal,
+                             bool lower) {
+    framework::AttributeMap attrs;
+    attrs["diagonal"] = diagonal;
+    attrs["lower"] = lower;
+    NameInTensorMap inputs({{"X", {&x}}});
+    int x_rank = x.dims().size();
+    PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument(
+                                     "Rank must be at least 2."));
+    std::vector<int> out_shape = framework::vectorize<int>(x.dims());
+    return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape);
+  }
+
  Tensor Conj(const Tensor& x) {
    Tensor out;
    auto* out_data = out.mutable_data<T>(x.dims(), context.GetPlace());

--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
@@ -65,11 +65,27 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
  __macro(cusolverDnSpotrfBatched);       \
  __macro(cusolverDnDpotrfBatched);       \
  __macro(cusolverDnSgesvdj_bufferSize);  \
+  __macro(cusolverDnSgeqrf_bufferSize);   \
+  __macro(cusolverDnDgeqrf_bufferSize);   \
+  __macro(cusolverDnCgeqrf_bufferSize);   \
+  __macro(cusolverDnZgeqrf_bufferSize);   \
+  __macro(cusolverDnSorgqr_bufferSize);   \
+  __macro(cusolverDnDorgqr_bufferSize);   \
+  __macro(cusolverDnCungqr_bufferSize);   \
+  __macro(cusolverDnZungqr_bufferSize);   \
  __macro(cusolverDnDestroyGesvdjInfo);   \
  __macro(cusolverDnCreateGesvdjInfo);    \
  __macro(cusolverDnDgesvdj_bufferSize);  \
  __macro(cusolverDnSgesvdj);             \
-  __macro(cusolverDnDgesvdj);
+  __macro(cusolverDnDgesvdj);             \
+  __macro(cusolverDnSgeqrf);              \
+  __macro(cusolverDnDgeqrf);              \
+  __macro(cusolverDnCgeqrf);              \
+  __macro(cusolverDnZgeqrf);              \
+  __macro(cusolverDnSorgqr);              \
+  __macro(cusolverDnDorgqr);              \
+  __macro(cusolverDnCungqr);              \
+  __macro(cusolverDnZungqr);

 CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
 #endif

--- a/python/paddle/fluid/tests/unittests/test_qr_op.py
+++ b/python/paddle/fluid/tests/unittests/test_qr_op.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import itertools
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.core as core
+
+
+class TestQrAPI(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+
+        def run_qr_dygraph(shape, mode, dtype):
+            if dtype == "float32":
+                np_dtype = np.float32
+            elif dtype == "float64":
+                np_dtype = np.float64
+            a = np.random.rand(*shape).astype(np_dtype)
+            m = a.shape[-2]
+            n = a.shape[-1]
+            min_mn = min(m, n)
+            if mode == "reduced" or mode == "r":
+                k = min_mn
+            else:
+                k = m
+            np_q_shape = list(a.shape[:-2])
+            np_q_shape.extend([m, k])
+            np_r_shape = list(a.shape[:-2])
+            np_r_shape.extend([k, n])
+            np_q = np.zeros(np_q_shape).astype(np_dtype)
+            np_r = np.zeros(np_r_shape).astype(np_dtype)
+            batch_size = a.size // (a.shape[-1] * a.shape[-2])
+            for i in range(batch_size):
+                coord = np.unravel_index(i, a.shape[:-2])
+                if mode == "r":
+                    tmp_r = np.linalg.qr(a[coord], mode=mode)
+                    np_r[coord] = tmp_r
+                else:
+                    tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode)
+                    np_q[coord] = tmp_q
+                    np_r[coord] = tmp_r
+
+            x = paddle.to_tensor(a, dtype=dtype)
+            if mode == "r":
+                r = paddle.linalg.qr(x, mode=mode)
+                self.assertTrue(np.allclose(r, np_r, atol=1e-5))
+            else:
+                q, r = paddle.linalg.qr(x, mode=mode)
+                self.assertTrue(np.allclose(q, np_q, atol=1e-5))
+                self.assertTrue(np.allclose(r, np_r, atol=1e-5))
+
+        tensor_shapes = [
+            (3, 5),
+            (5, 5),
+            (5, 3),  # 2-dim Tensors 
+            (2, 3, 5),
+            (3, 5, 5),
+            (4, 5, 3),  # 3-dim Tensors
+            (2, 5, 3, 5),
+            (3, 5, 5, 5),
+            (4, 5, 5, 3)  # 4-dim Tensors
+        ]
+        modes = ["reduced", "complete", "r"]
+        dtypes = ["float32", "float64"]
+        for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
+                                                           dtypes):
+            run_qr_dygraph(tensor_shape, mode, dtype)
+
+    def test_static(self):
+        paddle.enable_static()
+
+        def run_qr_static(shape, mode, dtype):
+            if dtype == "float32":
+                np_dtype = np.float32
+            elif dtype == "float64":
+                np_dtype = np.float64
+            a = np.random.rand(*shape).astype(np_dtype)
+            m = a.shape[-2]
+            n = a.shape[-1]
+            min_mn = min(m, n)
+            if mode == "reduced" or mode == "r":
+                k = min_mn
+            else:
+                k = m
+            np_q_shape = list(a.shape[:-2])
+            np_q_shape.extend([m, k])
+            np_r_shape = list(a.shape[:-2])
+            np_r_shape.extend([k, n])
+            np_q = np.zeros(np_q_shape).astype(np_dtype)
+            np_r = np.zeros(np_r_shape).astype(np_dtype)
+            places = []
+            places = [fluid.CPUPlace()]
+            if core.is_compiled_with_cuda():
+                places.append(fluid.CUDAPlace(0))
+            for place in places:
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    batch_size = a.size // (a.shape[-1] * a.shape[-2])
+                    for i in range(batch_size):
+                        coord = np.unravel_index(i, a.shape[:-2])
+                        if mode == "r":
+                            tmp_r = np.linalg.qr(a[coord], mode=mode)
+                            np_r[coord] = tmp_r
+                        else:
+                            tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode)
+                            np_q[coord] = tmp_q
+                            np_r[coord] = tmp_r
+                    x = paddle.fluid.data(
+                        name="input", shape=shape, dtype=dtype)
+                    if mode == "r":
+                        r = paddle.linalg.qr(x, mode=mode)
+                        exe = fluid.Executor(place)
+                        fetches = exe.run(fluid.default_main_program(),
+                                          feed={"input": a},
+                                          fetch_list=[r])
+                        self.assertTrue(
+                            np.allclose(
+                                fetches[0], np_r, atol=1e-5))
+                    else:
+                        q, r = paddle.linalg.qr(x, mode=mode)
+                        exe = fluid.Executor(place)
+                        fetches = exe.run(fluid.default_main_program(),
+                                          feed={"input": a},
+                                          fetch_list=[q, r])
+                        self.assertTrue(
+                            np.allclose(
+                                fetches[0], np_q, atol=1e-5))
+                        self.assertTrue(
+                            np.allclose(
+                                fetches[1], np_r, atol=1e-5))
+
+        tensor_shapes = [
+            (3, 5),
+            (5, 5),
+            (5, 3),  # 2-dim Tensors 
+            (2, 3, 5),
+            (3, 5, 5),
+            (4, 5, 3),  # 3-dim Tensors
+            (2, 5, 3, 5),
+            (3, 5, 5, 5),
+            (4, 5, 5, 3)  # 4-dim Tensors
+        ]
+        modes = ["reduced", "complete", "r"]
+        dtypes = ["float32", "float64"]
+        for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
+                                                           dtypes):
+            run_qr_static(tensor_shape, mode, dtype)
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
--- a/python/paddle/linalg.py
+++ b/python/paddle/linalg.py
@@ -23,6 +23,7 @@ from .tensor.linalg import eigvals  # noqa: F401
 from .tensor.linalg import multi_dot  # noqa: F401
 from .tensor.linalg import matrix_rank
 from .tensor.linalg import svd
+from .tensor.linalg import qr
 from .tensor.linalg import eigh  # noqa: F401
 from .tensor.linalg import det
 from .tensor.linalg import slogdet
@@ -38,6 +39,7 @@ __all__ = [
    'multi_dot',
    'matrix_rank',
    'svd',
+    'qr',
    'matrix_power',
    'det',
    'slogdet',

--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -47,6 +47,7 @@ from .linalg import histogram  # noqa: F401
 from .linalg import mv  # noqa: F401
 from .linalg import eig  # noqa: F401
 from .linalg import matrix_power  # noqa: F401
+from .linalg import qr  # noqa: F401
 from .linalg import eigvals  # noqa: F401
 from .linalg import multi_dot  # noqa: F401
 from .linalg import svd  # noqa: F401
@@ -237,6 +238,7 @@ tensor_method_func  = [ #noqa
           'histogram',
           'mv',
           'matrix_power',
+           'qr',
           'eigvals',
           'abs',
           'acos',

--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1594,6 +1594,70 @@ def matrix_power(x, n, name=None):
    return out


+def qr(x, mode="reduced", name=None):
+    r"""
+    Computes the QR decomposition of one matrix or batches of matrice (backward is unsupported now).
+
+    Args:
+        x (Tensor): The input tensor. Its shape should be `[..., M, N]`,
+            where ... is zero or more batch dimensions. M and N can be arbitrary
+            positive number. The data type of x should be float32 or float64. 
+        mode (str, optional): A flag to control the behavior of qr, the default is "reduced". 
+            Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`:
+            If mode = "reduced", qr op will return reduced Q and R matrices, 
+            which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`.
+            If mode = "complete", qr op will return complete Q and R matrices, 
+            which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`.
+            If mode = "r", qr op will only return reduced R matrix, which means
+            R's shape is `[..., K, N]`.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+            
+    Returns:
+        If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. 
+        If mode = "r", qr will return a tensor which represents R.
+        
+    Examples:            
+        .. code-block:: python
+
+            import paddle 
+
+            x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
+            q, r = paddle.linalg.qr(x)
+            print (q)
+            print (r)
+
+            # Q = [[-0.16903085,  0.89708523],
+            #      [-0.50709255,  0.27602622],
+            #      [-0.84515425, -0.34503278]])
+
+            # R = [[-5.91607978, -7.43735744],
+            #      [ 0.        ,  0.82807867]])
+            
+            # one can verify : X = Q * R ;     
+    """
+    if in_dygraph_mode():
+        q, r = _C_ops.qr(x, 'mode', mode)
+        if mode == "r":
+            return r
+        else:
+            return q, r
+    check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'qr')
+    check_type(mode, 'mode', str, 'qr')
+    helper = LayerHelper('qr', **locals())
+    q = helper.create_variable_for_type_inference(dtype=x.dtype)
+    r = helper.create_variable_for_type_inference(dtype=x.dtype)
+    attrs = dict()
+    attrs['mode'] = mode
+    helper.append_op(
+        type='qr', inputs={'X': [x]}, outputs={'Q': q,
+                                               'R': r}, attrs=attrs)
+    if mode == "r":
+        return r
+    else:
+        return q, r
+
+
 def eig(x, name=None):
    """
    This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices.