未验证 提交 616ce203 编写于 作者: Y Yulong Ao 提交者: GitHub

[Cherry-pick] Add the forward QR operator (#36627)

上级 fced11bd
......@@ -185,6 +185,7 @@ function(op_library TARGET)
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
list(REMOVE_ITEM hip_srcs "svd_op.cu")
list(REMOVE_ITEM hip_srcs "qr_op.cu")
list(REMOVE_ITEM hip_srcs "eigh_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/qr_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
using DDim = framework::DDim;
class QrOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr");
OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr");
OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr");
auto x_dims = ctx->GetInputDim("X");
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::string mode = ctx->Attrs().Get<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = framework::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
ctx->SetOutputDim("Q", framework::make_ddim(q_dims_vec));
} else {
ctx->SetOutputDim("Q", framework::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = framework::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
ctx->SetOutputDim("R", framework::make_ddim(r_dims_vec));
ctx->ShareLoD("X", /*->*/ "Q");
ctx->ShareLoD("X", /*->*/ "R");
}
};
class QrOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of qr op.");
AddOutput("Q", "(Tensor), The output Q tensor of qr op.");
AddOutput("R", "(Tensor), The output R tensor of qr op.");
AddAttr<std::string>(
"mode",
"(string, default \"reduced\"). "
"If mode is \"reduced\", Qr op will return reduced Q and R matrices. "
"If mode is \"complete\", Qr op will return complete Q and R matrices. "
"If mode is \"r\", Qr op will only return reduced R matrix.")
.SetDefault("reduced");
AddComment(R"DOC(
Qr Operator.
This operator is used to perform QR operation for batched matrics $X$.
$$Q, R = qr(X)$$
)DOC");
}
};
class QrGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")), "Input",
"Q@Grad", "QrGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")), "Input",
"R@Grad", "QrGrad");
OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad");
OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
"X@Grad", "QrGrad");
auto x_dims = ctx->GetInputDim(("X"));
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(dtype, ctx.GetPlace());
}
};
template <typename T>
class QrGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("qr_grad");
retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q"));
retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R"));
retv->SetInput("Q", this->Output("Q"));
retv->SetInput("R", this->Output("R"));
retv->SetInput("X", this->Input("X"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
ops::QrGradMaker<paddle::framework::OpDesc>,
ops::QrGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
REGISTER_OP_CPU_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include <thrust/device_vector.h>
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/qr_op.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
// Reuse some helper functions from svd
#include "paddle/fluid/operators/svd_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class QrGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
const std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int qr_stride = m * n;
int tau_stride = min_mn;
if (compute_q) {
q.mutable_data<math::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(math::Real<T>)));
}
r.mutable_data<math::Real<T>>(
context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real<T>)));
auto dito =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
T>(context);
// Note: allocate temporary tensors because of lacking in-place operatios.
// Prepare qr
Tensor qr;
qr.mutable_data<math::Real<T>>(
context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real<T>)));
// BatchedGeqrf performs computation in-place and 'qr' must be a copy of
// input
TensorCopy(x, context.GetPlace(), &qr);
// Prepare tau
auto tau_dims_vec = framework::vectorize<int>(x_dims);
tau_dims_vec.pop_back();
tau_dims_vec[tau_dims_vec.size() - 1] = min_mn;
Tensor tau = dito.Fill(tau_dims_vec, 0);
// Transpose 'qr' to conform the column-major order
auto tmp_qr = dito.Transpose(qr);
framework::TensorCopy(tmp_qr, qr.place(), &qr);
auto qr_data = qr.mutable_data<T>(context.GetPlace());
auto tau_data = tau.mutable_data<T>(context.GetPlace());
BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride,
tau_stride);
if (reduced_mode) {
auto trans_qr = dito.Transpose(qr);
auto sliced_qr = dito.Slice(trans_qr, {-2}, {0}, {min_mn});
auto tmp_r = dito.TrilTriu(sliced_qr, 0, false);
// Transpose 'tmp_r' to retore the original row-major order
framework::TensorCopy(tmp_r, r.place(), &r);
} else {
auto trans_qr = dito.Transpose(qr);
auto tmp_r = dito.TrilTriu(trans_qr, 0, false);
// Transpose 'tmp_r' to retore the original row-major order
framework::TensorCopy(tmp_r, r.place(), &r);
}
if (compute_q) {
// Perform QRGQR for Q using the result from GEQRF
// Transpose 'q' to retore the original row-major order
if (reduced_mode) {
BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m,
tau_data, qr_stride, tau_stride);
auto trans_q = dito.Transpose(qr);
auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn});
framework::TensorCopy(sliced_q, q.place(), &q);
} else {
if (m > n) {
auto new_qr_dims_vec = framework::vectorize<int>(x_dims);
new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m;
Tensor new_qr = dito.Fill(new_qr_dims_vec, 0);
auto new_qr_data = new_qr.mutable_data<T>(context.GetPlace());
auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) {
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
(new_qr_data + i * new_qr_stride),
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
(qr_data + i * qr_stride), qr_stride * sizeof(math::Real<T>),
dev_ctx.stream());
}
BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m,
tau_data, new_qr_stride, tau_stride);
auto trans_q = dito.Transpose(new_qr);
framework::TensorCopy(trans_q, q.place(), &q);
} else {
BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data,
qr_stride, tau_stride);
auto trans_q = dito.Transpose(qr);
auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m});
framework::TensorCopy(sliced_q, q.place(), &q);
}
}
}
}
void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size,
int m, int n, float* a, int lda, float* tau, int a_stride,
int tau_stride) const;
void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size,
int m, int n, double* a, int lda, double* tau, int a_stride,
int tau_stride) const;
void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size,
int m, int n, int k, float* a, int lda, float* tau,
int a_stride, int tau_stride) const;
void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size,
int m, int n, int k, double* a, int lda, double* tau,
int a_stride, int tau_stride) const;
};
template <>
void QrGPUKernel<float>::BatchedGeqrf(
const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
float* a, int lda, float* tau, int a_stride, int tau_stride) const {
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize(
handle, m, n, a, lda, &lwork));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
auto info = memory::Alloc(dev_ctx, sizeof(int));
int* info_d = reinterpret_cast<int*>(info->ptr());
for (int i = 0; i < batch_size; ++i) {
float* a_working_ptr = &a[i * a_stride];
float* tau_working_ptr = &tau[i * tau_stride];
// compute geqrf
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf(
handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork,
info_d));
// Do we need synchronized here?
// check the error info
int info_h;
memory::Copy(platform::CPUPlace(), &info_h,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_d, sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h));
}
}
template <>
void QrGPUKernel<double>::BatchedGeqrf(
const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
double* a, int lda, double* tau, int a_stride, int tau_stride) const {
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize(
handle, m, n, a, lda, &lwork));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
auto info = memory::Alloc(dev_ctx, sizeof(int));
int* info_d = reinterpret_cast<int*>(info->ptr());
for (int i = 0; i < batch_size; ++i) {
double* a_working_ptr = &a[i * a_stride];
double* tau_working_ptr = &tau[i * tau_stride];
// compute geqrf
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf(
handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork,
info_d));
// Do we need synchronized here?
// check the error info
int info_h;
memory::Copy(platform::CPUPlace(), &info_h,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_d, sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h));
}
}
template <>
void QrGPUKernel<float>::BatchedOrgqr(
const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const {
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize(
handle, m, n, k, a, lda, tau, &lwork));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
auto info = memory::Alloc(dev_ctx, sizeof(int));
int* info_d = reinterpret_cast<int*>(info->ptr());
for (int i = 0; i < batch_size; ++i) {
float* a_working_ptr = &a[i * a_stride];
float* tau_working_ptr = &tau[i * tau_stride];
// compute orggr
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr(
handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr,
lwork, info_d));
// Do we need synchronized here?
// check the error info
int info_h;
memory::Copy(platform::CPUPlace(), &info_h,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_d, sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h));
}
}
template <>
void QrGPUKernel<double>::BatchedOrgqr(
const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n,
int k, double* a, int lda, double* tau, int a_stride,
int tau_stride) const {
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize(
handle, m, n, k, a, lda, tau, &lwork));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
auto info = memory::Alloc(dev_ctx, sizeof(int));
int* info_d = reinterpret_cast<int*>(info->ptr());
for (int i = 0; i < batch_size; ++i) {
double* a_working_ptr = &a[i * a_stride];
double* tau_working_ptr = &tau[i * tau_stride];
// compute orggr
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr(
handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr,
lwork, info_d));
// Do we need synchronized here?
// check the error info
int info_h;
memory::Copy(platform::CPUPlace(), &info_h,
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_d, sizeof(int), dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h));
}
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel<float>, ops::QrGPUKernel<double>);
REGISTER_OP_CUDA_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::QrGradKernel<paddle::platform::CUDADeviceContext, double>);
#endif // not PADDLE_WITH_HIP
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Dense>
#include <cstdarg>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
template <typename T>
class QrCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<math::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = q.mutable_data<math::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(math::Real<T>)));
}
auto* r_data = r.mutable_data<math::Real<T>>(
context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real<T>)));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
};
template <typename DeviceContext, typename T>
class QrGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
PADDLE_THROW(platform::errors::InvalidArgument(
"QR doesn't have the backward kernel now and will be supported soon."));
}
};
} // namespace operators
} // namespace paddle
......@@ -502,6 +502,19 @@ struct DeviceIndependenceTensorOperations {
return ret;
}
framework::Tensor TrilTriu(const framework::Tensor& x, int diagonal,
bool lower) {
framework::AttributeMap attrs;
attrs["diagonal"] = diagonal;
attrs["lower"] = lower;
NameInTensorMap inputs({{"X", {&x}}});
int x_rank = x.dims().size();
PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument(
"Rank must be at least 2."));
std::vector<int> out_shape = framework::vectorize<int>(x.dims());
return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape);
}
Tensor Conj(const Tensor& x) {
Tensor out;
auto* out_data = out.mutable_data<T>(x.dims(), context.GetPlace());
......
......@@ -65,11 +65,27 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
__macro(cusolverDnSpotrfBatched); \
__macro(cusolverDnDpotrfBatched); \
__macro(cusolverDnSgesvdj_bufferSize); \
__macro(cusolverDnSgeqrf_bufferSize); \
__macro(cusolverDnDgeqrf_bufferSize); \
__macro(cusolverDnCgeqrf_bufferSize); \
__macro(cusolverDnZgeqrf_bufferSize); \
__macro(cusolverDnSorgqr_bufferSize); \
__macro(cusolverDnDorgqr_bufferSize); \
__macro(cusolverDnCungqr_bufferSize); \
__macro(cusolverDnZungqr_bufferSize); \
__macro(cusolverDnDestroyGesvdjInfo); \
__macro(cusolverDnCreateGesvdjInfo); \
__macro(cusolverDnDgesvdj_bufferSize); \
__macro(cusolverDnSgesvdj); \
__macro(cusolverDnDgesvdj);
__macro(cusolverDnDgesvdj); \
__macro(cusolverDnSgeqrf); \
__macro(cusolverDnDgeqrf); \
__macro(cusolverDnCgeqrf); \
__macro(cusolverDnZgeqrf); \
__macro(cusolverDnSorgqr); \
__macro(cusolverDnDorgqr); \
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import itertools
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
class TestQrAPI(unittest.TestCase):
def test_dygraph(self):
paddle.disable_static()
def run_qr_dygraph(shape, mode, dtype):
if dtype == "float32":
np_dtype = np.float32
elif dtype == "float64":
np_dtype = np.float64
a = np.random.rand(*shape).astype(np_dtype)
m = a.shape[-2]
n = a.shape[-1]
min_mn = min(m, n)
if mode == "reduced" or mode == "r":
k = min_mn
else:
k = m
np_q_shape = list(a.shape[:-2])
np_q_shape.extend([m, k])
np_r_shape = list(a.shape[:-2])
np_r_shape.extend([k, n])
np_q = np.zeros(np_q_shape).astype(np_dtype)
np_r = np.zeros(np_r_shape).astype(np_dtype)
batch_size = a.size // (a.shape[-1] * a.shape[-2])
for i in range(batch_size):
coord = np.unravel_index(i, a.shape[:-2])
if mode == "r":
tmp_r = np.linalg.qr(a[coord], mode=mode)
np_r[coord] = tmp_r
else:
tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode)
np_q[coord] = tmp_q
np_r[coord] = tmp_r
x = paddle.to_tensor(a, dtype=dtype)
if mode == "r":
r = paddle.linalg.qr(x, mode=mode)
self.assertTrue(np.allclose(r, np_r, atol=1e-5))
else:
q, r = paddle.linalg.qr(x, mode=mode)
self.assertTrue(np.allclose(q, np_q, atol=1e-5))
self.assertTrue(np.allclose(r, np_r, atol=1e-5))
tensor_shapes = [
(3, 5),
(5, 5),
(5, 3), # 2-dim Tensors
(2, 3, 5),
(3, 5, 5),
(4, 5, 3), # 3-dim Tensors
(2, 5, 3, 5),
(3, 5, 5, 5),
(4, 5, 5, 3) # 4-dim Tensors
]
modes = ["reduced", "complete", "r"]
dtypes = ["float32", "float64"]
for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
dtypes):
run_qr_dygraph(tensor_shape, mode, dtype)
def test_static(self):
paddle.enable_static()
def run_qr_static(shape, mode, dtype):
if dtype == "float32":
np_dtype = np.float32
elif dtype == "float64":
np_dtype = np.float64
a = np.random.rand(*shape).astype(np_dtype)
m = a.shape[-2]
n = a.shape[-1]
min_mn = min(m, n)
if mode == "reduced" or mode == "r":
k = min_mn
else:
k = m
np_q_shape = list(a.shape[:-2])
np_q_shape.extend([m, k])
np_r_shape = list(a.shape[:-2])
np_r_shape.extend([k, n])
np_q = np.zeros(np_q_shape).astype(np_dtype)
np_r = np.zeros(np_r_shape).astype(np_dtype)
places = []
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
with fluid.program_guard(fluid.Program(), fluid.Program()):
batch_size = a.size // (a.shape[-1] * a.shape[-2])
for i in range(batch_size):
coord = np.unravel_index(i, a.shape[:-2])
if mode == "r":
tmp_r = np.linalg.qr(a[coord], mode=mode)
np_r[coord] = tmp_r
else:
tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode)
np_q[coord] = tmp_q
np_r[coord] = tmp_r
x = paddle.fluid.data(
name="input", shape=shape, dtype=dtype)
if mode == "r":
r = paddle.linalg.qr(x, mode=mode)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": a},
fetch_list=[r])
self.assertTrue(
np.allclose(
fetches[0], np_r, atol=1e-5))
else:
q, r = paddle.linalg.qr(x, mode=mode)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": a},
fetch_list=[q, r])
self.assertTrue(
np.allclose(
fetches[0], np_q, atol=1e-5))
self.assertTrue(
np.allclose(
fetches[1], np_r, atol=1e-5))
tensor_shapes = [
(3, 5),
(5, 5),
(5, 3), # 2-dim Tensors
(2, 3, 5),
(3, 5, 5),
(4, 5, 3), # 3-dim Tensors
(2, 5, 3, 5),
(3, 5, 5, 5),
(4, 5, 5, 3) # 4-dim Tensors
]
modes = ["reduced", "complete", "r"]
dtypes = ["float32", "float64"]
for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
dtypes):
run_qr_static(tensor_shape, mode, dtype)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -23,6 +23,7 @@ from .tensor.linalg import eigvals # noqa: F401
from .tensor.linalg import multi_dot # noqa: F401
from .tensor.linalg import matrix_rank
from .tensor.linalg import svd
from .tensor.linalg import qr
from .tensor.linalg import eigh # noqa: F401
from .tensor.linalg import det
from .tensor.linalg import slogdet
......@@ -38,6 +39,7 @@ __all__ = [
'multi_dot',
'matrix_rank',
'svd',
'qr',
'matrix_power',
'det',
'slogdet',
......
......@@ -47,6 +47,7 @@ from .linalg import histogram # noqa: F401
from .linalg import mv # noqa: F401
from .linalg import eig # noqa: F401
from .linalg import matrix_power # noqa: F401
from .linalg import qr # noqa: F401
from .linalg import eigvals # noqa: F401
from .linalg import multi_dot # noqa: F401
from .linalg import svd # noqa: F401
......@@ -237,6 +238,7 @@ tensor_method_func = [ #noqa
'histogram',
'mv',
'matrix_power',
'qr',
'eigvals',
'abs',
'acos',
......
......@@ -1594,6 +1594,70 @@ def matrix_power(x, n, name=None):
return out
def qr(x, mode="reduced", name=None):
r"""
Computes the QR decomposition of one matrix or batches of matrice (backward is unsupported now).
Args:
x (Tensor): The input tensor. Its shape should be `[..., M, N]`,
where ... is zero or more batch dimensions. M and N can be arbitrary
positive number. The data type of x should be float32 or float64.
mode (str, optional): A flag to control the behavior of qr, the default is "reduced".
Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`:
If mode = "reduced", qr op will return reduced Q and R matrices,
which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`.
If mode = "complete", qr op will return complete Q and R matrices,
which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`.
If mode = "r", qr op will only return reduced R matrix, which means
R's shape is `[..., K, N]`.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R.
If mode = "r", qr will return a tensor which represents R.
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
q, r = paddle.linalg.qr(x)
print (q)
print (r)
# Q = [[-0.16903085, 0.89708523],
# [-0.50709255, 0.27602622],
# [-0.84515425, -0.34503278]])
# R = [[-5.91607978, -7.43735744],
# [ 0. , 0.82807867]])
# one can verify : X = Q * R ;
"""
if in_dygraph_mode():
q, r = _C_ops.qr(x, 'mode', mode)
if mode == "r":
return r
else:
return q, r
check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'qr')
check_type(mode, 'mode', str, 'qr')
helper = LayerHelper('qr', **locals())
q = helper.create_variable_for_type_inference(dtype=x.dtype)
r = helper.create_variable_for_type_inference(dtype=x.dtype)
attrs = dict()
attrs['mode'] = mode
helper.append_op(
type='qr', inputs={'X': [x]}, outputs={'Q': q,
'R': r}, attrs=attrs)
if mode == "r":
return r
else:
return q, r
def eig(x, name=None):
"""
This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册