未验证 提交 39f7c41f 编写于 作者: Z zhiboniu 提交者: GitHub

Add new API cholesky_solve (#38167)

上级 bcf86e5c
......@@ -196,6 +196,7 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu")
list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
list(REMOVE_ITEM hip_srcs "svd_op.cu")
list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cholesky_solve_op.h"
#include "paddle/fluid/operators/solve_op.h"
namespace paddle {
namespace operators {
class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddComment(R"DOC(Solves a linear system of equations with a positive "
"semidefinite matrix to be inverted given its Cholesky factor matrix uu."
")DOC");
AddInput("X", "(Tensor) The input tensor, shape of (*,m,k)");
AddInput("Y",
"(Tensor) The input tensor, shape of (*,m,m) composed of upper or "
"lower triangular Cholesky factor");
AddOutput("Out", "(Tensor) The output tensor, shape same to X");
AddAttr<bool>("upper",
"whether to consider the Cholesky factor "
"as a lower or upper triangular matrix")
.SetDefault(false);
}
};
class CholeskySolveOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve");
OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve");
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve");
auto u_dims = context->GetInputDim("Y");
auto b_dims = context->GetInputDim("X");
int u_rank = u_dims.size();
int b_rank = b_dims.size();
PADDLE_ENFORCE_GE(u_rank, 2,
platform::errors::InvalidArgument(
"the rank of input Y must greater or equal to 2"));
PADDLE_ENFORCE_GE(b_rank, 2,
platform::errors::InvalidArgument(
"the rank of input X must greater or equal to 2"));
PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2],
platform::errors::InvalidArgument(
"input Matrix Y should be square matrix,"
"But Got last shape of %ld x %ld",
u_dims[u_rank - 1], u_dims[u_rank - 2]));
PADDLE_ENFORCE_EQ(
b_dims[b_rank - 2], u_dims[u_rank - 2],
platform::errors::InvalidArgument(
"the first dim of input X must equal to the dim of input Y,"
"But Got %ld and %ld",
b_dims[b_rank - 2], u_dims[u_rank - 2]));
std::vector<int64_t> u_dims_vec = paddle::framework::vectorize(u_dims);
std::vector<int64_t> b_dims_vec = paddle::framework::vectorize(b_dims);
std::vector<int64_t> u_dims_vec_cut(u_dims_vec.begin(),
u_dims_vec.end() - 2);
std::vector<int64_t> b_dims_vec_cut(b_dims_vec.begin(),
b_dims_vec.end() - 2);
std::vector<int64_t> expand_batch_portion =
get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut);
std::vector<int64_t> b_broadcast_dims({expand_batch_portion});
b_broadcast_dims.insert(b_broadcast_dims.end(),
{b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]});
// dim of 'Out' is the same with 'Y' after broadcast
context->SetOutputDim("Out", framework::make_ddim(b_broadcast_dims));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Y"), ctx.GetPlace());
}
};
class CholeskySolveOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
auto var_type = ctx->GetInputType("Y", 0);
auto data_type = ctx->GetInputDataType("Y", 0);
ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS);
ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS);
}
};
template <typename T>
class CholeskySolveOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("cholesky_solve_grad");
retv->SetInput("X", this->Input("X"));
retv->SetInput("Y", this->Input("Y"));
retv->SetInput("Out", this->Output("Out"));
retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
retv->SetAttrMap(this->Attrs());
}
};
class CholeskySolveGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cholesky_solve");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "cholesky_solve");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "cholesky_solve");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "cholesky_solve");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp,
ops::CholeskySolveOpMaker,
ops::CholeskySolveOpVarTypeInference,
ops::CholeskySolveOpGradMaker<paddle::framework::OpDesc>,
ops::CholeskySolveOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp);
REGISTER_OP_CPU_KERNEL(
cholesky_solve,
ops::CholeskySolveKernel<paddle::platform::CPUDeviceContext, float>,
ops::CholeskySolveKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
cholesky_solve_grad,
ops::CholeskySolveGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::CholeskySolveGradKernel<paddle::platform::CPUDeviceContext, double>);
// Complex<> is not supported because of TensorExpand, which used to boardcast
// input Tensor
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/cholesky_solve_op.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
template <typename T>
void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo,
int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb,
int *devInfo);
template <>
void cusolver_potrs<float>(const cusolverDnHandle_t &cusolverH,
cublasFillMode_t uplo, int n, int nrhs, float *Adata,
int lda, float *Bdata, int ldb, int *devInfo) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs(
cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo));
}
template <>
void cusolver_potrs<double>(const cusolverDnHandle_t &cusolverH,
cublasFillMode_t uplo, int n, int nrhs,
double *Adata, int lda, double *Bdata, int ldb,
int *devInfo) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs(
cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo));
}
template <>
void cusolver_potrs<platform::complex<float>>(
const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs,
platform::complex<float> *Adata, int lda, platform::complex<float> *Bdata,
int ldb, int *devInfo) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs(
cusolverH, uplo, n, nrhs, reinterpret_cast<const cuComplex *>(Adata), lda,
reinterpret_cast<cuComplex *>(Bdata), ldb, devInfo));
}
template <>
void cusolver_potrs<platform::complex<double>>(
const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs,
platform::complex<double> *Adata, int lda, platform::complex<double> *Bdata,
int ldb, int *devInfo) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs(
cusolverH, uplo, n, nrhs,
reinterpret_cast<const cuDoubleComplex *>(Adata), lda,
reinterpret_cast<cuDoubleComplex *>(Bdata), ldb, devInfo));
}
template <typename T>
class CholeskySolveFunctor<paddle::platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n,
int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
cublasFillMode_t uplo =
upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
/* step 1: get cusolver handle*/
auto cusolverH = dev_ctx.cusolver_dn_handle();
/* step 2: solve A0*X0 = B0 */
cusolver_potrs<T>(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda,
devInfo);
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
}
};
template <typename T>
class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const Tensor &in, Tensor *out,
const framework::ExecutionContext &ctx) {
// For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
// out_reduce_dim should be [0, 2]
const std::vector<std::int64_t> in_dims = framework::vectorize(in.dims());
auto in_size = in_dims.size();
const std::vector<std::int64_t> out_dims =
framework::vectorize(out->dims());
auto out_size = out_dims.size();
std::vector<std::int64_t> out_bst_dims(in_size);
std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
std::copy(out_dims.data(), out_dims.data() + out_size,
out_bst_dims.data() + in_size - out_size);
std::vector<int> out_reduce_dims;
for (size_t idx = 0; idx <= in_size - 3; idx++) {
if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
out_reduce_dims.push_back(idx);
}
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
cholesky_solve,
ops::CholeskySolveKernel<paddle::platform::CUDADeviceContext, float>,
ops::CholeskySolveKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
cholesky_solve_grad,
ops::CholeskySolveGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::CholeskySolveGradKernel<paddle::platform::CUDADeviceContext, double>);
#endif // not PADDLE_WITH_HIP
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/solve_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/include/math.h"
namespace paddle {
namespace operators { // namespace operators
template <typename DeviceContext, typename T>
class CholeskySolveFunctor {
public:
void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n,
int nrhs, T *Adata, int lda, T *Bdata, int *devInfo);
};
template <typename T>
class CholeskySolveFunctor<paddle::platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n,
int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
char uplo = upper ? 'U' : 'L';
math::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
devInfo);
}
};
template <typename DeviceContext, typename T>
void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx,
const framework::Tensor &uin,
const framework::Tensor &bin, framework::Tensor *out,
bool upper) {
const auto &dev_ctx = ctx.template device_context<DeviceContext>();
// framework::Tensor broadcast
std::vector<int64_t> u_bst_dims_vec;
std::vector<int64_t> b_bst_dims_vec;
std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin);
framework::Tensor u_bst(uin.type());
TensorExpand<T, DeviceContext>(dev_ctx, uin, &u_bst, u_bst_dims_vec);
framework::Tensor b_bst(bin.type());
TensorExpand<T, DeviceContext>(dev_ctx, bin, &b_bst, b_bst_dims_vec);
math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
// calculate u's conjugate for complex
framework::Tensor u_conj(u_bst.type());
platform::ForRange<DeviceContext> u_for_range(dev_ctx, u_bst.numel());
math::ConjFunctor<T> u_functor(
u_bst.data<T>(), u_bst.numel(),
u_conj.mutable_data<T>(u_bst.dims(), dev_ctx.GetPlace()));
u_for_range(u_functor);
u_conj = helper.Transpose(u_conj);
// calculate b's conjugate for complex
framework::Tensor b_conj(b_bst.type());
platform::ForRange<DeviceContext> b_for_range(dev_ctx, b_bst.numel());
math::ConjFunctor<T> b_functor(
b_bst.data<T>(), b_bst.numel(),
b_conj.mutable_data<T>(b_bst.dims(), dev_ctx.GetPlace()));
b_for_range(b_functor);
b_conj = helper.Transpose(b_conj);
auto ut_data = u_conj.mutable_data<T>(dev_ctx.GetPlace());
auto uindims = u_bst.dims();
auto bindims = b_bst.dims();
int uinrank = uindims.size();
int binrank = bindims.size();
int n = uindims[uinrank - 2];
int nrhs = bindims[binrank - 1];
int ldab = std::max(1, n);
// framework::Tensor out_copy(b_conj.type());
// out_copy.Resize(b_conj.dims());
framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out);
T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
auto info_dims = slice_ddim(bindims, 0, binrank - 2);
auto batchsize = product(info_dims);
framework::Tensor tmp;
std::vector<int> tmpdim(1, batchsize);
tmp.Resize(framework::make_ddim(tmpdim));
int *info = tmp.mutable_data<int>(dev_ctx.GetPlace());
CholeskySolveFunctor<DeviceContext, T> functor;
for (int b = 0; b < batchsize; b++) {
auto uin_data_item = &ut_data[b * n * n];
auto out_data_item = &out_data[b * n * nrhs];
auto info_item = &info[b];
functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item,
info_item);
}
// calculate out's conjugate for complex
platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
math::ConjFunctor<T> out_functor(
out->data<T>(), out->numel(),
out->mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
out_for_range(out_functor);
*out = helper.Transpose(*out);
}
template <typename DeviceContext, typename T>
class CholeskySolveKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto *uin = ctx.Input<framework::Tensor>("Y");
auto *bin = ctx.Input<framework::Tensor>("X");
auto *out = ctx.Output<framework::Tensor>("Out");
auto upper = ctx.Attr<bool>("upper");
cholesky_solve_fn<DeviceContext, T>(ctx, *uin, *bin, out, upper);
}
};
template <typename DeviceContext, typename T>
class CholeskySolveGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *bin = ctx.Input<framework::Tensor>("X");
auto *uin = ctx.Input<framework::Tensor>("Y");
auto *out = ctx.Input<framework::Tensor>("Out");
auto *dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *db = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto *du = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
auto upper = ctx.Attr<bool>("upper");
const auto &dev_ctx = ctx.template device_context<DeviceContext>();
math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
std::vector<int64_t> u_bst_dims_vec;
std::vector<int64_t> b_bst_dims_vec;
std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin);
framework::Tensor u_bst(uin->type());
TensorExpand<T, DeviceContext>(dev_ctx, *uin, &u_bst, u_bst_dims_vec);
framework::Tensor db_bst(bin->type());
TensorExpand<T, DeviceContext>(dev_ctx, *bin, &db_bst, b_bst_dims_vec);
if (dout) {
db->mutable_data<T>(dev_ctx.GetPlace());
cholesky_solve_fn<DeviceContext, T>(ctx, u_bst, *dout, &db_bst, upper);
if (db_bst.dims() == db->dims()) {
framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db);
} else {
MatrixReduceSumFunctor<DeviceContext, T> functor;
functor(db_bst, db, ctx);
db->Resize(bin->dims());
}
auto blas = math::GetBlas<DeviceContext, T>(ctx);
// calculate out's conjugate for complex
framework::Tensor out_conj(out->type());
platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
math::ConjFunctor<T> out_functor(
out->data<T>(), out->numel(),
out_conj.mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
out_for_range(out_functor);
out_conj = helper.Transpose(out_conj);
framework::Tensor commonterm(out->type());
auto outdims = out_conj.dims();
auto dbdims = db_bst.dims();
auto mat_dim_a = math::CreateMatrixDescriptor(outdims, 0, false);
auto mat_dim_b = math::CreateMatrixDescriptor(dbdims, 0, false);
auto cmtdim = outdims;
cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2];
commonterm.Resize(cmtdim);
commonterm.mutable_data<T>(dev_ctx.GetPlace());
blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast<T>(1),
&commonterm, static_cast<T>(0));
// calculate commonterm's conjugate for complex
framework::Tensor commonterm_conj(commonterm.type());
platform::ForRange<DeviceContext> commonterm_for_range(
dev_ctx, commonterm.numel());
math::ConjFunctor<T> commonterm_functor(
commonterm.data<T>(), commonterm.numel(),
commonterm_conj.mutable_data<T>(commonterm.dims(),
dev_ctx.GetPlace()));
commonterm_for_range(commonterm_functor);
commonterm_conj = helper.Transpose(commonterm_conj);
auto pt_x = paddle::experimental::MakePtenDenseTensor(commonterm);
auto pt_y = paddle::experimental::MakePtenDenseTensor(commonterm_conj);
auto pt_z = paddle::experimental::MakePtenDenseTensor(commonterm);
pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get());
auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false);
auto mat_dim_c =
math::CreateMatrixDescriptor(commonterm.dims(), 0, false);
Tensor du_bst(uin->type());
// get upper or lower triangular
du_bst.Resize(u_bst.dims());
du_bst.mutable_data<T>(dev_ctx.GetPlace());
if (upper) {
blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast<T>(-1),
&du_bst, static_cast<T>(0));
} else {
blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast<T>(-1),
&du_bst, static_cast<T>(0));
}
const auto &udims = u_bst.dims();
const auto H = udims[udims.size() - 2];
const auto W = udims[udims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx, u_bst.numel());
TrilTriuCompute<T> tril_triu_computer(du_bst.data<T>(), 0, !upper, H, W,
u_bst.data<T>());
x_for_range(tril_triu_computer);
du->mutable_data<T>(dev_ctx.GetPlace());
if (u_bst.dims() == du->dims()) {
framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du);
} else {
MatrixReduceSumFunctor<DeviceContext, T> functor;
functor(u_bst, du, ctx);
du->Resize(uin->dims());
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -125,6 +125,38 @@ void lapackEig<platform::complex<float>, float>(
reinterpret_cast<std::complex<float> *>(work), &lwork, rwork, info);
}
template <>
void lapackCholeskySolve<platform::complex<double>>(
char uplo, int n, int nrhs, platform::complex<double> *a, int lda,
platform::complex<double> *b, int ldb, int *info) {
platform::dynload::zpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<double> *>(a), &lda,
reinterpret_cast<std::complex<double> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<platform::complex<float>>(char uplo, int n, int nrhs,
platform::complex<float> *a,
int lda,
platform::complex<float> *b,
int ldb, int *info) {
platform::dynload::cpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<float> *>(a), &lda,
reinterpret_cast<std::complex<float> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<double>(char uplo, int n, int nrhs, double *a, int lda,
double *b, int ldb, int *info) {
platform::dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
template <>
void lapackCholeskySolve<float>(char uplo, int n, int nrhs, float *a, int lda,
float *b, int ldb, int *info) {
platform::dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -32,6 +32,10 @@ void lapackEig(char jobvl, char jobvr, int n, T1* a, int lda, T1* w, T1* vl,
int ldvl, T1* vr, int ldvr, T1* work, int lwork, T2* rwork,
int* info);
template <typename T>
void lapackCholeskySolve(char uplo, int n, int nrhs, T* a, int lda, T* b,
int ldb, int* info);
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -19,7 +19,8 @@ namespace paddle {
namespace operators {
template <typename T>
struct MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const Tensor& in, Tensor* out,
const framework::ExecutionContext& ctx) {
// For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
......
......@@ -49,6 +49,10 @@ extern void *cusolver_dso_handle;
__macro(cusolverDnDpotrf_bufferSize); \
__macro(cusolverDnSpotrf); \
__macro(cusolverDnDpotrf); \
__macro(cusolverDnSpotrs); \
__macro(cusolverDnDpotrs); \
__macro(cusolverDnCpotrs); \
__macro(cusolverDnZpotrs); \
__macro(cusolverDnSsyevd_bufferSize); \
__macro(cusolverDnDsyevd_bufferSize); \
__macro(cusolverDnCheevd_bufferSize); \
......@@ -64,6 +68,8 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
__macro(cusolverDnSpotrfBatched); \
__macro(cusolverDnDpotrfBatched); \
__macro(cusolverDnSpotrsBatched); \
__macro(cusolverDnDpotrsBatched); \
__macro(cusolverDnSgesvdj_bufferSize); \
__macro(cusolverDnSgeqrf_bufferSize); \
__macro(cusolverDnDgeqrf_bufferSize); \
......
......@@ -66,6 +66,15 @@ extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
std::complex<float> *work, int *lwork, float *rwork,
int *info);
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
float *b, int *ldb, int *info);
namespace paddle {
namespace platform {
namespace dynload {
......@@ -105,7 +114,11 @@ extern void *lapack_dso_handle;
__macro(dgeev_); \
__macro(sgeev_); \
__macro(zgeev_); \
__macro(cgeev_);
__macro(cgeev_); \
__macro(zpotrs_); \
__macro(cpotrs_); \
__macro(dpotrs_); \
__macro(spotrs_);
LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.w
from __future__ import print_function
import unittest
import numpy as np
import scipy
import scipy.linalg
import sys
sys.path.append("..")
import paddle
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard, core
paddle.enable_static()
def cholesky_solution(X, B, upper=True):
if upper:
A = np.triu(X)
L = A.T
U = A
else:
A = np.tril(X)
L = A
U = A.T
return scipy.linalg.solve_triangular(
U, scipy.linalg.solve_triangular(
L, B, lower=True))
def scipy_cholesky_solution(X, B, upper=True):
if upper:
umat = np.triu(X)
A = umat.T @umat
else:
umat = np.tril(X)
A = umat @umat.T
K = scipy.linalg.cho_factor(A)
return scipy.linalg.cho_solve(K, B)
def boardcast_shape(matA, matB):
shapeA = matA.shape
shapeB = matB.shape
Boardshape = []
for idx in range(len(shapeA) - 2):
if shapeA[idx] == shapeB[idx]:
Boardshape.append(shapeA[idx])
continue
elif shapeA[idx] == 1 or shapeB[idx] == 1:
Boardshape.append(max(shapeA[idx], shapeB[idx]))
else:
raise Exception(
'shapeA and shapeB should be boardcasted, but got {} and {}'.
format(shapeA, shapeB))
bsA = Boardshape + list(shapeA[-2:])
bsB = Boardshape + list(shapeB[-2:])
return np.broadcast_to(matA, bsA), np.broadcast_to(matB, bsB)
def scipy_cholesky_solution_batch(bumat, bB, upper=True):
bumat, bB = boardcast_shape(bumat, bB)
ushape = bumat.shape
bshape = bB.shape
bumat = bumat.reshape((-1, ushape[-2], ushape[-1]))
bB = bB.reshape((-1, bshape[-2], bshape[-1]))
batch = 1
for d in ushape[:-2]:
batch *= d
bx = []
for b in range(batch):
# x = scipy_cholesky_solution(bumat[b], bB[b], upper) #large matrix result error
x = cholesky_solution(bumat[b], bB[b], upper)
bx.append(x)
return np.array(bx).reshape(bshape)
# 2D + 2D , , upper=False
class TestCholeskySolveOp(OpTest):
"""
case 1
"""
def config(self):
self.y_shape = [15, 15]
self.x_shape = [15, 5]
self.upper = False
self.dtype = np.float64
def set_output(self):
umat = self.inputs['Y']
self.output = scipy_cholesky_solution_batch(
umat, self.inputs['X'], upper=self.upper)
def setUp(self):
self.op_type = "cholesky_solve"
self.config()
if self.upper:
umat = np.triu(np.random.random(self.y_shape).astype(self.dtype))
else:
umat = np.tril(np.random.random(self.y_shape).astype(self.dtype))
self.inputs = {
'X': np.random.random(self.x_shape).astype(self.dtype),
'Y': umat
}
self.attrs = {'upper': self.upper}
self.set_output()
self.outputs = {'Out': self.output}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['Y'], 'Out', max_relative_error=0.01)
# 3D(broadcast) + 3D, upper=True
class TestCholeskySolveOp3(TestCholeskySolveOp):
"""
case 3
"""
def config(self):
self.y_shape = [1, 10, 10]
self.x_shape = [2, 10, 5]
self.upper = True
self.dtype = np.float64
class TestCholeskySolveAPI(unittest.TestCase):
def setUp(self):
np.random.seed(2021)
self.place = [paddle.CPUPlace()]
# self.place = [paddle.CUDAPlace(0)]
self.dtype = "float64"
self.upper = True
if core.is_compiled_with_cuda():
self.place.append(paddle.CUDAPlace(0))
def check_static_result(self, place):
paddle.enable_static()
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[10, 2], dtype=self.dtype)
y = fluid.data(name="y", shape=[10, 10], dtype=self.dtype)
z = paddle.linalg.cholesky_solve(x, y, upper=self.upper)
x_np = np.random.random([10, 2]).astype(self.dtype)
y_np = np.random.random([10, 10]).astype(self.dtype)
if self.upper:
umat = np.triu(y_np)
else:
umat = np.tril(y_np)
z_np = cholesky_solution(umat, x_np, upper=self.upper)
z2_np = scipy_cholesky_solution(umat, x_np, upper=self.upper)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"x": x_np,
"y": umat},
fetch_list=[z])
self.assertTrue(np.allclose(fetches[0], z_np))
def test_static(self):
for place in self.place:
self.check_static_result(place=place)
def test_dygraph(self):
def run(place):
paddle.disable_static(place)
x_np = np.random.random([20, 2]).astype(self.dtype)
y_np = np.random.random([20, 20]).astype(self.dtype)
z_np = scipy_cholesky_solution(y_np, x_np, upper=self.upper)
x = paddle.to_tensor(x_np)
y = paddle.to_tensor(y_np)
z = paddle.linalg.cholesky_solve(x, y, upper=self.upper)
self.assertTrue(np.allclose(z_np, z.numpy()))
self.assertEqual(z_np.shape, z.numpy().shape)
paddle.enable_static()
for idx, place in enumerate(self.place):
run(place)
def test_boardcast(self):
def run(place):
paddle.disable_static()
x_np = np.random.random([1, 30, 2]).astype(self.dtype)
y_np = np.random.random([2, 30, 30]).astype(self.dtype)
nx_np = np.concatenate((x_np, x_np), axis=0)
z_sci = scipy_cholesky_solution_batch(y_np, nx_np, upper=self.upper)
x = paddle.to_tensor(x_np)
y = paddle.to_tensor(y_np)
z = paddle.linalg.cholesky_solve(x, y, upper=self.upper)
self.assertEqual(z_sci.shape, z.numpy().shape)
self.assertTrue(np.allclose(z_sci, z.numpy()))
for idx, place in enumerate(self.place):
run(place)
class TestCholeskySolveOpError(unittest.TestCase):
def test_errors(self):
paddle.enable_static()
with program_guard(Program(), Program()):
# The input type of solve_op must be Variable.
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
y1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x1, y1)
# The data type of input must be float32 or float64.
x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x2, y2)
x3 = fluid.data(name="x3", shape=[30, 30], dtype="int32")
y3 = fluid.data(name="y3", shape=[30, 10], dtype="int32")
self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x3, y3)
x4 = fluid.data(name="x4", shape=[30, 30], dtype="float16")
y4 = fluid.data(name="y4", shape=[30, 10], dtype="float16")
self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x4, y4)
# The number of dimensions of input'X must be >= 2.
x5 = fluid.data(name="x5", shape=[30], dtype="float64")
y5 = fluid.data(name="y5", shape=[30, 30], dtype="float64")
self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x5, y5)
# The number of dimensions of input'Y must be >= 2.
x6 = fluid.data(name="x6", shape=[30, 30], dtype="float64")
y6 = fluid.data(name="y6", shape=[30], dtype="float64")
self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x6, y6)
# The inner-most 2 dimensions of input'X should be equal to each other
x7 = fluid.data(name="x7", shape=[2, 3, 4], dtype="float64")
y7 = fluid.data(name="y7", shape=[2, 4, 3], dtype="float64")
self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x7, y7)
if __name__ == "__main__":
unittest.main()
......@@ -49,6 +49,7 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
'sparse_attention', \
'svd', \
'matrix_power', \
'cholesky_solve', \
'solve', \
]
......
......@@ -18,18 +18,19 @@ from .tensor.linalg import eig # noqa: F401
from .tensor.linalg import cond # noqa: F401
from .tensor.linalg import matrix_power # noqa: F401
from .tensor.linalg import solve # noqa: F401
from .tensor.linalg import cholesky_solve # noqa: F401
from .tensor import inverse as inv # noqa: F401
from .tensor.linalg import eigvals # noqa: F401
from .tensor.linalg import multi_dot # noqa: F401
from .tensor.linalg import matrix_rank
from .tensor.linalg import svd
from .tensor.linalg import eigvalsh
from .tensor.linalg import qr
from .tensor.linalg import matrix_rank # noqa: F401
from .tensor.linalg import svd # noqa: F401
from .tensor.linalg import eigvalsh # noqa: F401
from .tensor.linalg import qr # noqa: F401
from .tensor.linalg import eigh # noqa: F401
from .tensor.linalg import det
from .tensor.linalg import slogdet
from .tensor.linalg import pinv
from .tensor.linalg import triangular_solve
from .tensor.linalg import det # noqa: F401
from .tensor.linalg import slogdet # noqa: F401
from .tensor.linalg import pinv # noqa: F401
from .tensor.linalg import triangular_solve # noqa: F401
__all__ = [
'cholesky', #noqa
......@@ -49,5 +50,6 @@ __all__ = [
'eigvalsh',
'pinv',
'solve',
'cholesky_solve',
'triangular_solve',
]
......@@ -60,6 +60,7 @@ from .linalg import eigvalsh # noqa: F401
from .linalg import eigh # noqa: F401
from .linalg import pinv # noqa: F401
from .linalg import solve # noqa: F401
from .linalg import cholesky_solve # noqa: F401
from .logic import equal # noqa: F401
from .logic import greater_equal # noqa: F401
from .logic import greater_than # noqa: F401
......@@ -433,6 +434,7 @@ tensor_method_func = [ #noqa
'uniform_',
'multi_dot',
'solve',
'cholesky_solve',
'triangular_solve',
'asinh',
'atanh',
......
......@@ -2388,6 +2388,56 @@ def triangular_solve(x,
return out
def cholesky_solve(x, y, upper=False, name=None):
r"""
Solves a linear system of equations A @ X = B, given A's Cholesky factor matrix u and matrix B.
Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs
is also batches.
Args:
x (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or
more batch dimensions. Its data type should be float32 or float64.
y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is
zero or more batch dimensions. Its data type should be float32 or float64.
upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False.
name(str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The solution of the system of equations. Its data type is the same as that of `x`.
Examples:
.. code-block:: python
import paddle
u = paddle.to_tensor([[1, 1, 1],
[0, 2, 1],
[0, 0,-1]], dtype="float64")
b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
out = paddle.linalg.cholesky_solve(b, u, upper=True)
print(out)
# [-2.5, -7, 9.5]
"""
if in_dygraph_mode():
return _C_ops.cholesky_solve(x, y, 'upper', upper)
helper = LayerHelper("cholesky_solve", **locals())
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'cholesky_solve')
check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'cholesky_solve')
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='cholesky_solve',
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'upper': upper})
return out
def eigvalsh(x, UPLO='L', name=None):
"""
Computes the eigenvalues of a
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册