Unverified commit 8caf951c, authored by Weilong Wu, committed by GitHub

Add paddle.linalg.solve OP (#35715)

* Add linalg.solve op, test=develop

* Fix a bug caused by accidental deletion

* updated description and fix a bug: missing a comma

* Add linalg.solve op, test=develop

* updated solve op backward logic

* updated solve op backward logic again

* Add linalg.solve Op, test=develop

* Updated and modified to fit CI requirements

* Fix a bug

* 1) Add more test cases; 2) Fix a wrong usage in the reduce operation; 3) Remove redundant code

* Remove redundant comments

* 1) Removed redundant code; 2) Updated to enhance code robustness

* Removed redundant code

* Updated API documents
Parent 4c35f515
......@@ -123,7 +123,7 @@ lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling segment_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
......
......@@ -89,6 +89,7 @@ math_library(bert_encoder_functor)
math_library(tree2col DEPS math_function)
math_library(matrix_inverse)
math_library(segment_pooling)
math_library(matrix_solve)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
......
......@@ -247,6 +247,12 @@ class Blas {
template <typename T>
void BatchedMatInv(int n, const T** a, T** a_inv, int* info,
int batch_size) const;
// cuBlas solve
template <typename T>
void BatchedGETRS(CBLAS_TRANSPOSE trans, int n, int nrhs, const T** a,
int lda, int* ipiv, T** b, int ldb, int* info,
int batch_size) const;
#endif
private:
......@@ -402,6 +408,12 @@ class BlasT : private Blas<DeviceContext> {
void BatchedMatInv(ARGS... args) const {
Base()->template BatchedMatInv<T>(args...);
}
// solve
template <typename... ARGS>
void BatchedGETRS(ARGS... args) const {
Base()->template BatchedGETRS<T>(args...);
}
#endif
private:
......
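The BatchedGETRS entry declared above wraps cuBLAS's batched LU solve (cublas<S/D>getrsBatched), which is meant to be paired with the existing BatchedGETRF (cublas<S/D>getrfBatched): factor once, then solve for one or more right-hand sides. Below is a minimal CPU-side sketch of that factor-then-solve pattern using the Eigen LU decomposition this patch already relies on elsewhere; the matrix values and variable names are illustrative only, not part of the patch.
// Illustrative sketch, not part of this patch: factor once with LU, then
// reuse the factorization for several right-hand sides, which is the
// pattern BatchedGETRF + BatchedGETRS implement on the GPU.
#include <iostream>
#include "Eigen/Core"
#include "Eigen/LU"

int main() {
  Eigen::Matrix3d A;
  A << 4, 1, 0,
       1, 3, 1,
       0, 1, 2;
  // Factor A = P * L * U once (analogous to getrfBatched).
  Eigen::PartialPivLU<Eigen::Matrix3d> lu(A);
  // Reuse the factors to solve A * x = b for two different right-hand sides
  // (analogous to getrsBatched with nrhs > 1).
  Eigen::Vector3d b1(1, 2, 3), b2(0, 1, 0);
  Eigen::Vector3d x1 = lu.solve(b1);
  Eigen::Vector3d x2 = lu.solve(b2);
  std::cout << "residual 1: " << (A * x1 - b1).norm() << "\n";
  std::cout << "residual 2: " << (A * x2 - b2).norm() << "\n";
  return 0;
}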
......@@ -114,6 +114,12 @@ struct CUBlas<float> {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSmatinvBatched(args...));
}
template <typename... ARGS>
static void GETRS_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSgetrsBatched(args...));
}
};
template <>
......@@ -182,6 +188,12 @@ struct CUBlas<double> {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDmatinvBatched(args...));
}
template <typename... ARGS>
static void GETRS_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDgetrsBatched(args...));
}
};
template <>
......@@ -871,6 +883,20 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv,
T **b, int ldb, int *info, int batch_size) const {
// Note: only real types (float/double) are instantiated here; CUBLAS_OP_C
// (conjugate transpose) would be required to support complex inputs.
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -717,6 +717,19 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS(
CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv,
T **b, int ldb, int *info, int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "Eigen/Core"
#include "Eigen/LU"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class MatrixSolveFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor& a, const framework::Tensor& b,
framework::Tensor* out) {
compute_solve_eigen<platform::CPUDeviceContext, T>(dev_ctx, a, b, out);
}
};
template class MatrixSolveFunctor<platform::CPUDeviceContext, float>;
template class MatrixSolveFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/solve_op.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
class MatrixSolveFunctor;
template <typename T>
class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& a, const framework::Tensor& b,
framework::Tensor* out) {
#ifndef PADDLE_WITH_HIP
// solve the equation: Ax = B,
// use the cuBLAS cublas<S/D>getrfBatched function to perform the LU
// factorization of each matrix A,
// and then use the cuBLAS cublas<S/D>getrsBatched function to solve the
// equation with the LU factors.
// ref:
// https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched
const auto& a_dims = a.dims();
const int a_rank = a_dims.size();
int n = a_dims[a_rank - 1];
int lda = n;
int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1;
const auto& b_dims = b.dims();
const int b_rank = b_dims.size();
int nrhs = b_dims[b_rank - 1];
int ldb = b_dims[b_rank - 2];
// make sure the out dims is right
out->Resize(b_dims);
out->mutable_data<T>(context.GetPlace());
// copy input A to a temporary tensor tmp_a, because the LU factorization
// is written back in place over the factored matrix and the original A
// must be kept intact.
Tensor tmp_a(a.type());
tmp_a.Resize(a.dims());
tmp_a.mutable_data<T>(context.GetPlace());
TensorCopy(a, context.GetPlace(), &tmp_a);
// copy input B to a temporary tensor tmp_b and transpose tmp_b,
// because cuBLAS assumes column-major layout while Paddle uses row-major.
Tensor tmp_b(b.type());
const auto& new_dims_vec = getNewDimsVec(b_dims);
tmp_b.Resize(framework::make_ddim(new_dims_vec));
tmp_b.mutable_data<T>(context.GetPlace());
math::TransposeNormal<platform::CUDADeviceContext, T> trans;
std::vector<int> new_axis = getNewAxis(b_rank);
trans(context, b, &tmp_b, new_axis);
const T* a_data_in_gpu = tmp_a.data<T>();
const T* b_data_in_gpu = tmp_b.data<T>();
std::vector<const T*> cpu_ptrs(batch_size * 2);
for (int i = 0; i < batch_size; ++i) {
cpu_ptrs[i] = a_data_in_gpu + i * n * n;
cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs;
}
// Copy the addresses of A and tmp_b from host to device.
memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*), context.stream());
T** gpu_tmp_b_ptrs =
reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
// Allocate device memory for BatchedGETRF's info and pivots:
// n pivots per matrix plus one info entry per matrix.
int num_ints = batch_size * (n + 1);
memory::allocation::AllocationPtr tmp_gpu_info_data =
memory::Alloc(context, num_ints * sizeof(int));
int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
// only for singular checking
std::vector<int> info;
info.resize(batch_size);
int* gpu_pivot_ptr =
reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
// This function performs the LU factorization of each matrix A by the
// equation A = L * U. L and U are written back to original matrix A,
// and diagonal elements of L are discarded.
blas.BatchedGETRF(n, reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
gpu_pivot_ptr, gpu_info_ptr, batch_size);
// check whether BatchedGETRF is executed successfully or not
memory::Copy(platform::CPUPlace(), info.data(),
BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
gpu_info_ptr, sizeof(int) * batch_size, context.stream());
for (int i = 0; i < batch_size; ++i) {
PADDLE_ENFORCE_EQ(info[i], 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: U(%d, %d) is zero, singular U. "
"Please check the matrix value and change it to a "
"non-singular matrix",
i, info[i], info[i]));
}
// hold the result code from BatchedGETRS
int host_info = 0;
// solve the equation with the LU factors. CblasTrans is passed because
// tmp_a holds row-major data, which cuBLAS (column-major) reads as A^T;
// solving the transposed system therefore yields the solution of A * X = B.
CBLAS_TRANSPOSE transA = CblasTrans;
blas.BatchedGETRS(
transA, n, nrhs, reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
lda, gpu_pivot_ptr, gpu_tmp_b_ptrs, ldb, &host_info, batch_size);
// check whether BatchedGETRS is executed successfully or not
PADDLE_ENFORCE_EQ(host_info, 0,
platform::errors::InvalidArgument(
"The [%d]'th argument to cublas*getrsBatched had "
"an illegal value.",
-host_info));
// transpose tmp_b to get the final result in row-major form.
math::TransposeNormal<platform::CUDADeviceContext, T> trans2;
trans2(context, tmp_b, out, new_axis);
#else
compute_solve_eigen<platform::CUDADeviceContext, T>(context, a, b, out);
#endif
}
};
template class MatrixSolveFunctor<platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
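The transposes around the cuBLAS calls above come from a layout mismatch: Paddle tensors are row-major while cuBLAS reads buffers as column-major, so the same buffer seen by cuBLAS is the transpose of the Paddle matrix. That is why tmp_b is transposed before the call, the LU factors describe A^T, and GETRS is invoked with CblasTrans so that it effectively solves A * X = B. The following small Eigen sketch is illustrative only and not part of the patch.
// Illustrative sketch, not part of this patch: reading a row-major buffer as
// column-major yields the transpose of the original matrix, which is why the
// GPU path transposes B and passes CblasTrans for A.
#include <iostream>
#include "Eigen/Core"

int main() {
  // A 2x3 matrix stored row-major: [[1, 2, 3], [4, 5, 6]].
  double buf[6] = {1, 2, 3, 4, 5, 6};
  Eigen::Map<Eigen::Matrix<double, 2, 3, Eigen::RowMajor>> row_view(buf);
  // The same buffer interpreted as a column-major 3x2 matrix.
  Eigen::Map<Eigen::Matrix<double, 3, 2, Eigen::ColMajor>> col_view(buf);
  std::cout << "row-major view:\n" << row_view << "\n";
  std::cout << "column-major view of the same buffer:\n" << col_view << "\n";
  std::cout << "column-major view equals the transpose: "
            << col_view.isApprox(row_view.transpose()) << "\n";
  return 0;
}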
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "Eigen/Core"
#include "Eigen/LU"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
void compute_solve_eigen(const DeviceContext& context,
const framework::Tensor& a, const framework::Tensor& b,
framework::Tensor* out) {
using Matrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using EigenMatrixMap = Eigen::Map<Matrix>;
using ConstEigenMatrixMap = Eigen::Map<const Matrix>;
// prepare for a
const auto& a_mat_dims = a.dims();
const int a_rank = a_mat_dims.size();
int n = a_mat_dims[a_rank - 1];
int a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1;
// prepare for b
const auto& b_mat_dims = b.dims();
const int b_rank = b_mat_dims.size();
int b_h = n;
int b_w = b_mat_dims[b_rank - 1];
int b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1;
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
out->Resize(b_mat_dims); // make sure the out dims is right
T* out_ptr = out->mutable_data<T>(context.GetPlace());
if (a_batch_size == b_batch_size) {
for (int i = 0; i < a_batch_size; ++i) {
ConstEigenMatrixMap a_mat(a_ptr + i * n * n, n, n);
ConstEigenMatrixMap b_mat(b_ptr + i * b_h * b_w, b_h, b_w);
EigenMatrixMap out_mat(out_ptr + i * b_h * b_w, b_h, b_w);
Eigen::PartialPivLU<Matrix> lu;
lu.compute(a_mat);
const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
PADDLE_ENFORCE_GT(
min_abs_pivot, static_cast<T>(0),
platform::errors::InvalidArgument("Input is not invertible."));
out_mat.noalias() = lu.solve(b_mat);
}
} else {
PADDLE_ENFORCE_EQ(a_batch_size, b_batch_size,
platform::errors::InvalidArgument(
"All input tensors must have the same batch size, but "
"received A's batch size = %d and B's batch size = %d.",
a_batch_size, b_batch_size));
}
}
template <typename DeviceContext, typename T>
class MatrixSolveFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& a,
const framework::Tensor& b, framework::Tensor* out);
};
} // namespace math
} // namespace operators
} // namespace paddle
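compute_solve_eigen above rejects singular systems by inspecting the diagonal of the LU factor: with partial pivoting, a zero pivot on the diagonal of U means the coefficient matrix is not invertible. The standalone sketch below illustrates that check with an intentionally singular matrix; values and variable names are illustrative only.
// Illustrative sketch, not part of this patch: detecting a singular matrix by
// checking the smallest absolute pivot of its partial-pivoting LU factors,
// the same test compute_solve_eigen performs before calling lu.solve().
#include <iostream>
#include "Eigen/Core"
#include "Eigen/LU"

int main() {
  Eigen::Matrix2d singular;
  singular << 1, 2,
              2, 4;  // the second row is a multiple of the first
  Eigen::PartialPivLU<Eigen::Matrix2d> lu(singular);
  double min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
  if (min_abs_pivot == 0.0) {
    std::cout << "matrix is singular, refusing to solve\n";
  } else {
    std::cout << "matrix is invertible, min |pivot| = " << min_abs_pivot << "\n";
  }
  return 0;
}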
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/solve_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
class SolveOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Solve");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Solve");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Solve");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
std::vector<int64_t> x_dims_vec =
paddle::framework::vectorize(ctx->GetInputDim("X"));
std::vector<int64_t> y_dims_vec =
paddle::framework::vectorize(ctx->GetInputDim("Y"));
auto x_dims_n = x_dims_vec.size();
auto y_dims_n = y_dims_vec.size();
PADDLE_ENFORCE_GT(x_dims_n, 1,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of SolveOp "
"should be larger than 1. But received X's "
"dimensions = %d, X's shape = [%s]",
x_dims_n, x_dims));
PADDLE_ENFORCE_GE(y_dims_n, 1,
platform::errors::InvalidArgument(
"The input tensor Y's dimensions of SolveOp "
"should be larger than or equal 1. But received Y's "
"dimensions = %d, Y's shape = [%s]",
y_dims_n, y_dims));
PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1],
platform::errors::InvalidArgument(
"The inner-most 2 dimensions of Input(X) all should "
"be square matrices "
"But received X's shape[-2] = %d and shape[-1] = %d.",
x_dims[x_dims_n - 2], x_dims[x_dims_n - 1]));
bool x_broadcasted = false, y_broadcasted = false;
bool trans_x = false, trans_y = false;
if (x_dims_n == 1) {
x_dims_vec.insert(x_dims_vec.begin(), 1);
x_dims_n = 2;
x_broadcasted = true;
}
if (y_dims_n == 1) {
y_dims_vec.push_back(1);
y_dims_n = 2;
y_broadcasted = true;
}
size_t M, N;
if (trans_x) {
M = x_dims_vec[x_dims_n - 1];
} else {
M = x_dims_vec[x_dims_n - 2];
}
if (trans_y) {
N = y_dims_vec[y_dims_n - 2];
} else {
N = y_dims_vec[y_dims_n - 1];
}
std::vector<int64_t> new_dims;
if (x_dims_n >= y_dims_n) {
new_dims.assign(x_dims_vec.begin(), x_dims_vec.end() - 2);
} else {
new_dims.assign(y_dims_vec.begin(), y_dims_vec.end() - 2);
}
if (!x_broadcasted) {
new_dims.push_back(M);
}
if (!y_broadcasted) {
new_dims.push_back(N);
}
if (x_broadcasted && y_broadcasted) {
new_dims.push_back(1);
}
auto out_dims = framework::make_ddim(new_dims);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library, customized_type_value);
}
};
class SolveOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The first input tensor of solve op.");
AddInput("Y", "(Tensor), The second input tensor of solve op.");
AddOutput("Out", "(Tensor), The output tensor of solve op.");
AddComment(R"DOC(
Solve Operator.
This operator computes the solution of a square system of
linear equations with a unique solution for inputs $X$ and $Y$.
The equation is:
$$Out = X^{-1} * Y$$
)DOC");
}
};
class SolveOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Out"}};
return m;
}
};
class SolveGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "solve");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "solve");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "solve");
// reuse the linalg.solve forward output
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "solve");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims);
}
}
};
template <typename T>
class SolveOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("solve_grad");
retv->SetInput("X", this->Input("X"));
retv->SetInput("Y", this->Input("Y"));
retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
// reuse the linalg.solve forward output
retv->SetInput("Out", this->Output("Out"));
retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
retv->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(solve, ops::SolveOp, ops::SolveOpMaker,
ops::SolveOpInferVarType,
ops::SolveOpGradMaker<paddle::framework::OpDesc>,
ops::SolveOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(solve_grad, ops::SolveGradOp);
REGISTER_OP_CPU_KERNEL(
solve, ops::SolveKernel<paddle::platform::CPUDeviceContext, float>,
ops::SolveKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
solve_grad, ops::SolveGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SolveGradKernel<paddle::platform::CPUDeviceContext, double>);
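SolveOp::InferShape above derives the output shape by treating a 1-D X as a 1 x n matrix, a 1-D Y as an n x 1 column vector, taking the batch dimensions from the higher-rank input, and then dropping the dimensions that were only added for the 1-D cases. A condensed sketch of that rule follows; the helper name and the test shapes are illustrative only, not part of the patch.
// Illustrative sketch, not part of this patch: a condensed version of the
// output-shape rule implemented by SolveOp::InferShape.
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the InferShape logic above.
std::vector<int64_t> SolveOutShape(std::vector<int64_t> x,
                                   std::vector<int64_t> y) {
  bool x_bc = false, y_bc = false;
  if (x.size() == 1) { x.insert(x.begin(), 1); x_bc = true; }  // 1-D X -> 1 x n
  if (y.size() == 1) { y.push_back(1); y_bc = true; }          // 1-D Y -> n x 1
  int64_t M = x[x.size() - 2];  // rows of the (square) coefficient matrix
  int64_t N = y[y.size() - 1];  // number of right-hand sides
  // Leading (batch) dims come from the higher-rank input; the full broadcast
  // compatibility check happens later in the kernel.
  const auto& lead = x.size() >= y.size() ? x : y;
  std::vector<int64_t> out(lead.begin(), lead.end() - 2);
  if (!x_bc) out.push_back(M);
  if (!y_bc) out.push_back(N);
  if (x_bc && y_bc) out.push_back(1);
  return out;
}

int main() {
  for (auto d : SolveOutShape({2, 3, 3}, {3})) std::cout << d << ' ';  // 2 3
  std::cout << '\n';
  for (auto d : SolveOutShape({10, 10}, {10, 4})) std::cout << d << ' ';  // 10 4
  std::cout << '\n';
  return 0;
}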
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/solve_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(solve, ops::SolveKernel<plat::CUDADeviceContext, float>,
ops::SolveKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(solve_grad,
ops::SolveGradKernel<plat::CUDADeviceContext, float>,
ops::SolveGradKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Eigen/Core"
#include "Eigen/LU"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
#include "paddle/fluid/operators/squeeze_op.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#endif
#define MAX_RANK_SUPPORTED 6
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using framework::To32BitIndex;
constexpr int kMULMKLDNNINT8 = 1;
struct IdentityFunctor {
HOSTDEVICE explicit inline IdentityFunctor() {}
template <typename U>
HOSTDEVICE inline U operator()(const U& x) const {
return x;
}
};
template <typename DeviceContext, typename T>
void ReduceSumForSolveGrad(const Tensor* input, Tensor* output,
const std::vector<int>& reduce_dims, bool keep_dim,
const paddle::framework::ExecutionContext& ctx) {
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor>(*input, output, reduce_dims,
static_cast<T>(0), cub::Sum(),
IdentityFunctor(), stream);
#else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx)
.template apply<T>();
#endif
}
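// Illustrative sketch, not part of this patch: when X (or Y) was broadcast
// over a batch dimension in the forward pass, its gradient has to be summed
// back over that dimension, which is what ReduceSumForSolveGrad does above
// via TensorReduce / ReduceKernelFunctor. A plain-C++ version of the idea for
// a [3, 3] matrix that was broadcast to a [2, 3, 3] batch (values are
// illustrative only):
#include <array>
#include <iostream>

int main() {
  // Gradient w.r.t. the broadcast tensor, shape [2, 3, 3].
  std::array<std::array<std::array<double, 3>, 3>, 2> grad_bc{};
  for (int b = 0; b < 2; ++b)
    for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 3; ++j) grad_bc[b][i][j] = b + 0.1 * i + 0.01 * j;

  // Reduce-sum over the broadcast batch dimension -> gradient of shape [3, 3].
  std::array<std::array<double, 3>, 3> grad{};
  for (int b = 0; b < 2; ++b)
    for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 3; ++j) grad[i][j] += grad_bc[b][i][j];

  std::cout << "grad[0][0] = " << grad[0][0] << "\n";  // sums both batch slices
  return 0;
}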
// check whether the right-hand side `other` is a vector (the vector rhs case)
static inline bool is_vector_rhs(const Tensor& input, const Tensor& other) {
auto x_dim = input.dims();
auto y_dim = other.dims();
auto x_dim_size = x_dim.size();
auto y_dim_size = y_dim.size();
std::vector<int64_t> x_dims_vec = paddle::framework::vectorize(x_dim);
std::vector<int64_t> y_dims_vec = paddle::framework::vectorize(y_dim);
std::vector<int64_t>::const_iterator f = x_dims_vec.begin();
std::vector<int64_t>::const_iterator l = x_dims_vec.end() - 1;
std::vector<int64_t> x_dims_vec_cut(f, l); // input.shape[:-1]
std::vector<int64_t> expected_batched_rhs_shape(x_dims_vec_cut);
bool vector_case =
y_dim_size == 1 || (x_dim_size - 1 == y_dim_size &&
y_dims_vec == (expected_batched_rhs_shape));
return vector_case;
}
// unsqueeze operation helper
static framework::DDim GetOutputShapeUnsqueeze(
const std::vector<int> unsqz_dims, const framework::DDim& in_dims) {
int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
int cur_output_size = in_dims.size();
std::vector<int64_t> output_shape(output_size, 0);
// Validity Check: rank range.
PADDLE_ENFORCE_LE(output_size, 6,
platform::errors::InvalidArgument(
"The output "
"tensor's rank should be less than 6."));
for (int axis : unsqz_dims) {
int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
// Validity Check: the axis bound
PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument(
"The insert dimension value should "
"not be less than 0"));
PADDLE_ENFORCE_LE(cur, cur_output_size,
platform::errors::InvalidArgument(
"The insert dimension value shoule not be larger "
"than the dimension size of input tensor"));
// Move old axis, and insert new axis
for (int i = cur_output_size; i >= cur; --i) {
if (output_shape[i] == 1) {
// Move axis
output_shape[i + 1] = 1;
output_shape[i] = 0;
}
}
output_shape[cur] = 1;
// Add the output size.
cur_output_size++;
}
// Make output shape
for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
if (output_shape[out_idx] == 0) {
output_shape[out_idx] = in_dims[in_idx++];
}
}
return framework::make_ddim(output_shape);
}
// operation like squeeze(-1)
static void to_squeeze(const framework::ExecutionContext& context,
const framework::Tensor& in, framework::Tensor* out) {
auto x_dims = in.dims();
std::vector<int> sqz_dims = {-1};
auto out_dims = GetOutputShape(sqz_dims, x_dims, true);
out->mutable_data(context.GetPlace(), in.type());
framework::TensorCopy(
in, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), out);
out->Resize(out_dims);
}
// vector_case, need to operate like unsqueeze(-1)
static void to_unsqueeze(const framework::ExecutionContext& context,
const framework::Tensor& in, framework::Tensor* out) {
auto x_dims = in.dims();
std::vector<int> unsqz_dims = {-1};
framework::DDim out_dims = out->dims();
out_dims = GetOutputShapeUnsqueeze(unsqz_dims, x_dims);
framework::TensorCopy(
in, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), out);
out->Resize(out_dims);
}
template <typename Container>
Container infer_size_impl(std::vector<int64_t> a, std::vector<int64_t> b) {
size_t dimsA = a.size();
size_t dimsB = b.size();
size_t ndim = dimsA > dimsB ? dimsA : dimsB;
Container expandedSizes(ndim);
for (ptrdiff_t i = (ptrdiff_t)ndim - 1; i >= 0; --i) {
ptrdiff_t offset = ndim - 1 - i;
ptrdiff_t dimA = dimsA - 1 - offset;
ptrdiff_t dimB = dimsB - 1 - offset;
int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
PADDLE_ENFORCE_EQ(
(sizeA == sizeB || sizeA == 1 || sizeB == 1), true,
platform::errors::PreconditionNotMet(
"The size of tensor a (%d) must match the size of tensor b "
"(%d) at non-singleton dimension %d.",
sizeA, sizeB, i));
expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
}
return expandedSizes;
}
// infer size for broadcast operation
static std::vector<int64_t> infer_size(std::vector<int64_t> a,
std::vector<int64_t> b) {
return infer_size_impl<std::vector<int64_t>>(a, b);
}
// necessary check before expand operation
static void expand_check(const Tensor& arg1,
std::vector<int64_t> expand_shape) {
auto rank = arg1.dims().size();
PADDLE_ENFORCE_GE(
rank, 1, platform::errors::InvalidArgument(
"The rank of the input 'X' for expand must be positive, "
"but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank, MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand must be less than "
"or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED, rank));
auto shape_size = static_cast<int>(expand_shape.size());
PADDLE_ENFORCE_GE(
shape_size, rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand must be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size, rank));
PADDLE_ENFORCE_LE(
shape_size, MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand must be "
"less than or equal to %d.",
shape_size, MAX_RANK_SUPPORTED));
}
// broadcast the batch dimensions of arg1 and arg2.
static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>>
_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
std::vector<int64_t> arg1_dims_vec =
paddle::framework::vectorize(arg1.dims());
std::vector<int64_t> arg2_dims_vec =
paddle::framework::vectorize(arg2.dims());
std::vector<int64_t>::const_iterator f1 = arg1_dims_vec.begin();
std::vector<int64_t>::const_iterator l1 = arg1_dims_vec.end() - 2;
std::vector<int64_t> arg1_dims_vec_cut(f1, l1);
std::vector<int64_t>::const_iterator f2 = arg2_dims_vec.begin();
std::vector<int64_t>::const_iterator l2 = arg2_dims_vec.end() - 2;
std::vector<int64_t> arg2_dims_vec_cut(f2, l2);
std::vector<int64_t> expand_batch_portion =
infer_size(arg1_dims_vec_cut, arg2_dims_vec_cut);
std::vector<int64_t> arg1_expand_size({expand_batch_portion});
arg1_expand_size.insert(
arg1_expand_size.end(),
{arg1_dims_vec[static_cast<int>(arg1_dims_vec.size()) - 2],
arg1_dims_vec[static_cast<int>(arg1_dims_vec.size()) - 1]});
std::vector<int64_t> arg2_expand_size({expand_batch_portion});
arg2_expand_size.insert(
arg2_expand_size.end(),
{arg2_dims_vec[static_cast<int>(arg2_dims_vec.size()) - 2],
arg2_dims_vec[static_cast<int>(arg2_dims_vec.size()) - 1]});
return std::make_tuple(arg1_expand_size, arg2_expand_size);
}
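// Illustrative sketch, not part of this patch: the right-aligned batch
// broadcast performed by infer_size / _broadcast_batch_dims above, written
// out for two concrete shapes. X with shape [2, 1, 4, 4] and Y with shape
// [3, 4, 5] have batch portions [2, 1] and [3]; aligned from the right and
// broadcast, the common batch shape is [2, 3], so X expands to [2, 3, 4, 4]
// and Y to [2, 3, 4, 5]. The standalone helper below is hypothetical.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> BroadcastBatch(const std::vector<int64_t>& a,
                                    const std::vector<int64_t>& b) {
  size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim, 1);
  for (size_t off = 0; off < ndim; ++off) {
    int64_t sa = off < a.size() ? a[a.size() - 1 - off] : 1;
    int64_t sb = off < b.size() ? b[b.size() - 1 - off] : 1;
    out[ndim - 1 - off] = sa == 1 ? sb : sa;  // assumes sa == sb or one is 1
  }
  return out;
}

int main() {
  for (auto d : BroadcastBatch({2, 1}, {3})) std::cout << d << ' ';  // 2 3
  std::cout << '\n';
  return 0;
}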
template <int Rank, typename T, typename DeviceContext>
void tensor_expand(const framework::ExecutionContext& context,
const Tensor& arg1, Tensor* out0,
std::vector<int64_t> expand_size) {
auto in_dims = arg1.dims();
auto expand_shape = expand_size;
auto vec_in_dims = framework::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> repeat_times(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(
expand_shape[i], 0,
platform::errors::InvalidArgument("The expanded size cannot be zero."));
if (i < diff) {
PADDLE_ENFORCE_GT(
expand_shape[i], 0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand operation.",
expand_shape[i]));
repeat_times[i] = expand_shape[i];
} else if (expand_shape[i] > 0) {
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i], expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand operation.",
vec_in_dims[i], expand_shape[i]));
repeat_times[i] = 1;
} else {
repeat_times[i] = expand_shape[i];
}
} else {
PADDLE_ENFORCE_EQ(
expand_shape[i], -1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
repeat_times[i] = 1;
}
}
Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
for (size_t i = 0; i < repeat_times.size(); ++i) {
bcast_dims[i] = repeat_times[i];
}
framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
framework::DDim out_dims(new_in_dims);
for (size_t i = 0; i < repeat_times.size(); ++i) {
out_dims[i] *= repeat_times[i];
}
out0->Resize(out_dims);
auto x = EigenTensor<T, Rank>::From(arg1, new_in_dims);
out0->mutable_data<T>(context.GetPlace());
auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
// use 32-bit index to speed up
bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
if (use_32bit_index) {
EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
place, To32BitIndex(y), To32BitIndex(x), bcast_dims);
} else {
EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
bcast_dims);
}
}
template <typename DeviceContext, typename T>
static void linalg_solve(const framework::ExecutionContext& context,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* out) {
out->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::MatrixSolveFunctor<DeviceContext, T> mat_solve;
// input y can be vector or matrix
// but need to be unsqueezed if y is a vector
bool is_vector = false;
is_vector = is_vector_rhs(*x, *y);
Tensor tmp_y;
if (is_vector) {
tmp_y.mutable_data(context.GetPlace(), y->type());
to_unsqueeze(context, *y, &tmp_y);
} else {
tmp_y.Resize(y->dims());
tmp_y.mutable_data(context.GetPlace(), y->type());
framework::TensorCopy(
*y, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), &tmp_y);
}
Tensor tmp_x;
tmp_x.Resize(x->dims());
tmp_x.mutable_data(context.GetPlace(), x->type());
framework::TensorCopy(
*x, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), &tmp_x);
std::vector<int64_t> x_broadcast_dims;
std::vector<int64_t> y_broadcast_dims;
std::tie(x_broadcast_dims, y_broadcast_dims) =
_broadcast_batch_dims(tmp_x, tmp_y);
expand_check(tmp_x, x_broadcast_dims);
expand_check(tmp_y, y_broadcast_dims);
Tensor tmp_x_bc;
Tensor tmp_y_bc;
auto tmp_x_rank = tmp_x.dims().size();
auto tmp_y_rank = tmp_y.dims().size();
auto rank_0 = std::max(tmp_x_rank, static_cast<int>(x_broadcast_dims.size()));
switch (rank_0) {
case 1:
tensor_expand<1, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
case 2:
tensor_expand<2, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
case 3:
tensor_expand<3, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
case 4:
tensor_expand<4, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
case 5:
tensor_expand<5, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
case 6:
tensor_expand<6, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
x_broadcast_dims);
break;
}
auto rank_1 = std::max(tmp_y_rank, static_cast<int>(y_broadcast_dims.size()));
switch (rank_1) {
case 1:
tensor_expand<1, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
case 2:
tensor_expand<2, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
case 3:
tensor_expand<3, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
case 4:
tensor_expand<4, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
case 5:
tensor_expand<5, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
case 6:
tensor_expand<6, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
y_broadcast_dims);
break;
}
auto x_dim = x->dims();
auto y_dim = y->dims();
auto x_dim_size = x_dim.size();
auto y_dim_size = y_dim.size();
if (is_vector) { // vector case
out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1)
mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out);
Tensor out_tmp;
out_tmp.Resize(out->dims());
out_tmp = *out;
to_squeeze(context, out_tmp, out); // out.squeeze(-1)
} else {
PADDLE_ENFORCE_EQ(
x_dim[x_dim_size - 1], y_dim[y_dim_size - 2],
platform::errors::InvalidArgument(
"Matrix X1 with dimension greater than 2 and any matrix Y1,"
"the matrix X1's width must be equal with matrix Y1's "
"height. But received X's shape = [%s], X1's shape = [%s], X1's "
"width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = "
"%s.",
x_dim, x_dim, x_dim[x_dim_size - 1], y_dim, y_dim,
y_dim[y_dim_size - 2]));
mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out);
}
}
// for TransposeNormal
static std::vector<int> getNewAxis(const int b_rank) {
std::vector<int> axis_1 = {0};
std::vector<int> axis_2 = {1, 0};
std::vector<int> axis_3 = {0, 2, 1};
std::vector<int> axis_4 = {0, 1, 3, 2};
std::vector<int> axis_5 = {0, 1, 2, 4, 3};
std::vector<int> axis_6 = {0, 1, 2, 3, 5, 4};
std::vector<int> axis_7 = {0, 1, 2, 3, 4, 6, 5};
std::vector<int> axis_8 = {0, 1, 2, 3, 4, 5, 7, 6};
std::vector<int> axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7};
switch (b_rank) {
case 1:
return axis_1;
break;
case 2:
return axis_2;
break;
case 3:
return axis_3;
break;
case 4:
return axis_4;
break;
case 5:
return axis_5;
break;
case 6:
return axis_6;
break;
case 7:
return axis_7;
break;
case 8:
return axis_8;
break;
default:
return axis_9;
}
}
// for Resize
static std::vector<int64_t> getNewDimsVec(const DDim& b_dims) {
std::vector<int64_t> b_dims_vec = paddle::framework::vectorize(b_dims);
int size = b_dims_vec.size();
if (size >= 2) {
// swap the last 2 elements in b_dims_vec
int64_t temp = b_dims_vec[size - 1];
b_dims_vec[size - 1] = b_dims_vec[size - 2];
b_dims_vec[size - 2] = temp;
return b_dims_vec;
}
PADDLE_ENFORCE_NE(
b_dims_vec.empty(), true,
platform::errors::PreconditionNotMet(
"The size of tensor b must not be %d after getting new dims", 0));
// if b_dims_vec.size() == 1, just return the original vec
return b_dims_vec;
}
template <typename DeviceContext, typename T>
class SolveKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* y = context.Input<framework::Tensor>("Y");
Tensor* out = context.Output<framework::Tensor>("Out");
linalg_solve<DeviceContext, T>(context, x, y, out);
}
};
template <typename DeviceContext, typename T>
class SolveGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* y = ctx.Input<framework::Tensor>("Y");
auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
// reuse the linalg.solve forward output
auto* out = ctx.Input<framework::Tensor>("Out");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
bool is_vector = false;
is_vector = is_vector_rhs(*input, *y);
Tensor tmp_y;
if (is_vector) {
tmp_y.mutable_data(ctx.GetPlace(), y->type());
to_unsqueeze(ctx, *y, &tmp_y);
} else {
tmp_y.Resize(y->dims());
tmp_y.mutable_data(ctx.GetPlace(), y->type());
framework::TensorCopy(
*y, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), &tmp_y);
}
Tensor tmp_x;
tmp_x.Resize(input->dims());
tmp_x.mutable_data(ctx.GetPlace(), input->type());
framework::TensorCopy(
*input, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), &tmp_x);
std::vector<int64_t> x_broadcast_dims;
std::vector<int64_t> y_broadcast_dims;
std::tie(x_broadcast_dims, y_broadcast_dims) =
_broadcast_batch_dims(tmp_x, tmp_y);
// tmp_dx
Tensor tmp_dx;
tmp_dx.Resize(framework::make_ddim(x_broadcast_dims));
tmp_dx.mutable_data<T>(ctx.GetPlace());
// tmp_dy
Tensor tmp_dy;
tmp_dy.Resize(framework::make_ddim(y_broadcast_dims));
tmp_dy.mutable_data<T>(ctx.GetPlace());
Tensor tmp_input(input->type());
const auto& new_dims_vec = getNewDimsVec(input->dims());
tmp_input.Resize(framework::make_ddim(new_dims_vec));
tmp_input.mutable_data<T>(ctx.GetPlace());
math::TransposeNormal<DeviceContext, T> trans;
std::vector<int> new_axis = getNewAxis(input->dims().size());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
trans(dev_ctx, *input, &tmp_input, new_axis);
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
// reuse the linalg_solve forward logic to get tmp_dy
linalg_solve<DeviceContext, T>(ctx, &tmp_input, dout, &tmp_dy);
}
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
// to get dx
auto blas = math::GetBlas<DeviceContext, T>(ctx);
if (input->dims().size() == 2 && y->dims().size() == 2) {
auto mat_dim_a1 = math::CreateMatrixDescriptor(tmp_dy.dims(), 0, false);
auto mat_dim_b1 = math::CreateMatrixDescriptor(out->dims(), 0, true);
blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0));
} else if (is_vector_rhs(*input, *y)) {
Tensor tmp_dy_;
tmp_dy_.mutable_data(ctx.GetPlace(), y->type());
to_unsqueeze(ctx, tmp_dy, &tmp_dy_);
Tensor tmp_out_;
tmp_out_.mutable_data(ctx.GetPlace(), out->type());
to_unsqueeze(ctx, *out, &tmp_out_);
auto mat_dim_a1 =
math::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false);
auto mat_dim_b1 =
math::CreateMatrixDescriptor(tmp_out_.dims(), 0, true);
blas.MatMul(tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx,
T(0));
} else {
auto mat_dim_a1 = math::CreateMatrixDescriptor(tmp_dy.dims(), 0, false);
auto mat_dim_b1 = math::CreateMatrixDescriptor(out->dims(), 0, true);
blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0));
}
}
if (y->dims() != tmp_dy.dims()) {
Tensor dy_help;
dy_help.Resize(tmp_dy.dims());
dy_help.mutable_data(ctx.GetPlace(), tmp_dy.type());
framework::TensorCopy(
tmp_dy, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), &dy_help);
// get dims
std::vector<std::int64_t> x_dims = vectorize(input->dims());
std::vector<std::int64_t> y_dims = vectorize(y->dims());
std::vector<std::int64_t> dout_dims = vectorize(dout->dims());
if (is_vector_rhs(*input, *y)) {
dout_dims.push_back(1);
}
int y_ndim = y_dims.size();
int ndim = dout_dims.size();
const std::vector<std::int64_t> dy_help_dims = vectorize(dy_help.dims());
std::vector<std::int64_t> dy_broadcast_dims(ndim);
std::fill(dy_broadcast_dims.data(),
dy_broadcast_dims.data() + ndim - y_ndim, 1);
std::copy(y_dims.data(), y_dims.data() + y_ndim,
dy_broadcast_dims.data() + ndim - y_ndim);
std::vector<int> dy_reduce_dims;
for (int idx = 0; idx <= ndim - 3; idx++) {
if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) {
dy_reduce_dims.push_back(idx);
}
}
// reduce sum to get grad by ReduceSum
if (dy) {
if (dy_reduce_dims.empty()) {
*dy = std::move(dy_help);
} else {
bool keep_dim = true;
if (dy_help.dims().size() != dy->dims().size()) {
keep_dim = false;
}
ReduceSumForSolveGrad<DeviceContext, T>(&dy_help, dy, dy_reduce_dims,
keep_dim, ctx);
}
dy->Resize(y->dims());
}
} else {
framework::TensorCopy(
tmp_dy, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
}
if (input->dims() != tmp_dx.dims()) {
Tensor dx_help;
dx_help.Resize(tmp_dx.dims());
dx_help.mutable_data(ctx.GetPlace(), tmp_dx.type());
framework::TensorCopy(
tmp_dx, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), &dx_help);
// get dims
std::vector<std::int64_t> x_dims = vectorize(input->dims());
std::vector<std::int64_t> y_dims = vectorize(y->dims());
int x_ndim = x_dims.size();
int ndim = x_broadcast_dims.size();
const std::vector<std::int64_t> dx_help_dims = vectorize(dx_help.dims());
std::vector<std::int64_t> dx_broadcast_dims(ndim);
std::fill(dx_broadcast_dims.data(),
dx_broadcast_dims.data() + ndim - x_ndim, 1);
std::copy(x_dims.data(), x_dims.data() + x_ndim,
dx_broadcast_dims.data() + ndim - x_ndim);
std::vector<int> dx_reduce_dims;
for (int idx = 0; idx <= ndim - 3; idx++) {
if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) {
dx_reduce_dims.push_back(idx);
}
}
// reduce sum to get grad by ReduceSum
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx_reduce_dims.empty()) {
*dx = std::move(dx_help);
} else {
bool keep_dim = true;
if (dx_help.dims().size() != dx->dims().size()) {
keep_dim = false;
}
ReduceSumForSolveGrad<DeviceContext, T>(&dx_help, dx, dx_reduce_dims,
keep_dim, ctx);
}
dx->Resize(input->dims());
}
} else {
framework::TensorCopy(
tmp_dx, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
}
}
};
} // namespace operators
} // namespace paddle
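For Out = X^{-1} * Y, the backward pass implemented by SolveGradKernel above uses dY = X^{-T} * dOut (computed by reusing the forward solve on the transposed X, built with getNewDimsVec / TransposeNormal) and dX = -dY * Out^T, followed by a reduce-sum over any broadcast batch dimensions. The following Eigen sketch is a numerical sanity check of those formulas with a central finite difference on one entry; it is illustrative only and not part of the patch.
// Illustrative sketch, not part of this patch: checking the gradient formulas
// used by SolveGradKernel for Out = X^{-1} * Y with a scalar loss
// L = sum(G .* Out):
//   dY = X^{-T} * G        (a solve with X transposed, as in the kernel)
//   dX = -dY * Out^T
#include <iostream>
#include "Eigen/Core"
#include "Eigen/LU"

int main() {
  Eigen::Matrix2d X;
  X << 3, 1,
       1, 2;
  Eigen::Matrix2d Y;
  Y << 1, 0,
       2, 1;
  Eigen::Matrix2d G;  // upstream gradient dL/dOut
  G << 0.5, -1,
       2, 0.25;

  Eigen::Matrix2d Out = X.partialPivLu().solve(Y);
  Eigen::Matrix2d Xt = X.transpose();
  Eigen::Matrix2d dY = Xt.partialPivLu().solve(G);   // X^{-T} * G
  Eigen::Matrix2d dX = -dY * Out.transpose();

  // Central finite difference on X(0, 1) for L = sum(G .* (X^{-1} * Y)).
  const double eps = 1e-6;
  Eigen::Matrix2d Xp = X, Xm = X;
  Xp(0, 1) += eps;
  Xm(0, 1) -= eps;
  Eigen::Matrix2d Outp = Xp.partialPivLu().solve(Y);
  Eigen::Matrix2d Outm = Xm.partialPivLu().solve(Y);
  double Lp = (G.array() * Outp.array()).sum();
  double Lm = (G.array() * Outm.array()).sum();
  double fd = (Lp - Lm) / (2 * eps);

  std::cout << "analytic dX(0,1) = " << dX(0, 1)
            << ", finite difference = " << fd << "\n";
  return 0;
}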
......@@ -89,7 +89,9 @@ extern void *cublas_dso_handle;
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched);
__macro(cublasDmatinvBatched); \
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
......
......@@ -108,6 +108,7 @@ from .tensor.linalg import matrix_power # noqa: F401
from .tensor.linalg import svd # noqa: F401
from .tensor.linalg import eigh # noqa: F401
from .tensor.linalg import pinv # noqa: F401
from .tensor.linalg import solve # noqa: F401
from .tensor.logic import equal # noqa: F401
from .tensor.logic import greater_equal # noqa: F401
from .tensor.logic import greater_than # noqa: F401
......
......@@ -967,6 +967,7 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120)
set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120)
set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120)
set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_solve_op PROPERTIES TIMEOUT 120)
if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import sys
sys.path.append("..")
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
# 2D normal case
class TestSolveOp(OpTest):
def config(self):
self.input_x_matrix_shape = [15, 15]
self.input_y_matrix_shape = [15, 10]
self.dtype = "float64"
def setUp(self):
paddle.enable_static()
self.config()
self.op_type = "solve"
np.random.seed(2021)
self.inputs = {
'X': np.random.random(self.input_x_matrix_shape).astype(self.dtype),
'Y': np.random.random(self.input_y_matrix_shape).astype(self.dtype)
}
self.outputs = {
'Out': np.linalg.solve(self.inputs['X'], self.inputs['Y'])
}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
# x broadcast + 3D batch case
class TestSolveOpBatched_case0(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((11, 11)).astype(self.dtype),
'Y': np.random.random((2, 11, 7)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-1)
# 3D batch + y vector case
class TestSolveOpBatched_case1(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((20, 6, 6)).astype(self.dtype),
'Y': np.random.random((20, 6)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04)
# 3D batch + y broadcast case
class TestSolveOpBatched_case2(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((2, 10, 10)).astype(self.dtype),
'Y': np.random.random((1, 10, 10)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
# x broadcast + 3D batch case
class TestSolveOpBatched_case3(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((1, 10, 10)).astype(self.dtype),
'Y': np.random.random((2, 10, 10)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
# 3D normal batch case
class TestSolveOpBatched_case4(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((3, 6, 6)).astype(self.dtype),
'Y': np.random.random((3, 6, 7)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
# 4D normal batch case
class TestSolveOpBatched_case5(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((2, 2, 6, 6)).astype(self.dtype),
'Y': np.random.random((2, 2, 6, 6)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
# 4D batch + y broadcast case
class TestSolveOpBatched_case6(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((2, 2, 6, 6)).astype(self.dtype),
'Y': np.random.random((1, 2, 6, 9)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
# 5D normal batch case
class TestSolveOpBatched_case7(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype),
'Y': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04)
# 5D batch + y broadcast case
class TestSolveOpBatched_case8(OpTest):
def setUp(self):
self.op_type = "solve"
self.dtype = "float64"
np.random.seed(2021)
self.inputs = {
'X': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype),
'Y': np.random.random((1, 2, 2, 4, 7)).astype(self.dtype)
}
result = np.linalg.solve(self.inputs['X'], self.inputs['Y'])
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04)
class TestSolveOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of solve_op must be Variable.
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
y1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
self.assertRaises(TypeError, paddle.linalg.solve, x1, y1)
# The data type of input must be float32 or float64.
x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
self.assertRaises(TypeError, paddle.linalg.solve, x2, y2)
x3 = fluid.data(name="x3", shape=[30, 30], dtype="int32")
y3 = fluid.data(name="y3", shape=[30, 10], dtype="int32")
self.assertRaises(TypeError, paddle.linalg.solve, x3, y3)
x4 = fluid.data(name="x4", shape=[30, 30], dtype="int64")
y4 = fluid.data(name="y4", shape=[30, 10], dtype="int64")
self.assertRaises(TypeError, paddle.linalg.solve, x4, y4)
x5 = fluid.data(name="x5", shape=[30, 30], dtype="float16")
y5 = fluid.data(name="y5", shape=[30, 10], dtype="float16")
self.assertRaises(TypeError, paddle.linalg.solve, x5, y5)
# The number of dimensions of input 'X' must be >= 2.
x6 = fluid.data(name="x6", shape=[30], dtype="float64")
y6 = fluid.data(name="y6", shape=[30], dtype="float64")
self.assertRaises(ValueError, paddle.linalg.solve, x6, y6)
# The inner-most 2 dimensions of input 'X' should be equal to each other
x7 = fluid.data(name="x7", shape=[2, 3, 4], dtype="float64")
y7 = fluid.data(name="y7", shape=[2, 4, 3], dtype="float64")
self.assertRaises(ValueError, paddle.linalg.solve, x7, y7)
# 2D + vector case, FP64
class TestSolveOpAPI_1(unittest.TestCase):
def setUp(self):
np.random.seed(2021)
self.place = [paddle.CPUPlace()]
self.dtype = "float64"
if core.is_compiled_with_cuda():
self.place.append(paddle.CUDAPlace(0))
def check_static_result(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
paddle_input_x = fluid.data(
name="input_x", shape=[3, 3], dtype=self.dtype)
paddle_input_y = fluid.data(
name="input_y", shape=[3], dtype=self.dtype)
paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
np_input_x = np.random.random([3, 3]).astype(self.dtype)
np_input_y = np.random.random([3]).astype(self.dtype)
np_result = np.linalg.solve(np_input_x, np_input_y)
exe = fluid.Executor(place)
fetches = exe.run(
fluid.default_main_program(),
feed={"input_x": np_input_x,
"input_y": np_input_y},
fetch_list=[paddle_result])
self.assertTrue(
np.allclose(fetches[0], np.linalg.solve(np_input_x,
np_input_y)))
def test_static(self):
for place in self.place:
self.check_static_result(place=place)
def test_dygraph(self):
def run(place):
paddle.disable_static(place)
np.random.seed(2021)
input_x_np = np.random.random([3, 3]).astype(self.dtype)
input_y_np = np.random.random([3]).astype(self.dtype)
tensor_input_x = paddle.to_tensor(input_x_np)
tensor_input_y = paddle.to_tensor(input_y_np)
numpy_output = np.linalg.solve(input_x_np, input_y_np)
paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
self.assertEqual(
np.allclose(numpy_output, paddle_output.numpy()), True)
self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
paddle.enable_static()
for place in self.place:
run(place)
# 2D normal case, FP64
class TestSolveOpAPI_2(unittest.TestCase):
def setUp(self):
np.random.seed(2021)
self.place = [paddle.CPUPlace()]
self.dtype = "float64"
if core.is_compiled_with_cuda():
self.place.append(paddle.CUDAPlace(0))
def check_static_result(self, place):
paddle.enable_static()
with fluid.program_guard(fluid.Program(), fluid.Program()):
paddle_input_x = fluid.data(
name="input_x", shape=[10, 10], dtype=self.dtype)
paddle_input_y = fluid.data(
name="input_y", shape=[10, 4], dtype=self.dtype)
paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
np_input_x = np.random.random([10, 10]).astype(self.dtype)
np_input_y = np.random.random([10, 4]).astype(self.dtype)
np_result = np.linalg.solve(np_input_x, np_input_y)
exe = fluid.Executor(place)
fetches = exe.run(
fluid.default_main_program(),
feed={"input_x": np_input_x,
"input_y": np_input_y},
fetch_list=[paddle_result])
self.assertTrue(
np.allclose(fetches[0], np.linalg.solve(np_input_x,
np_input_y)))
def test_static(self):
for place in self.place:
self.check_static_result(place=place)
def test_dygraph(self):
def run(place):
paddle.disable_static(place)
np.random.seed(2021)
input_x_np = np.random.random([10, 10]).astype(self.dtype)
input_y_np = np.random.random([10, 4]).astype(self.dtype)
tensor_input_x = paddle.to_tensor(input_x_np)
tensor_input_y = paddle.to_tensor(input_y_np)
numpy_output = np.linalg.solve(input_x_np, input_y_np)
paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
self.assertEqual(
np.allclose(numpy_output, paddle_output.numpy()), True)
self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
paddle.enable_static()
for place in self.place:
run(place)
# 2D normal case, FP32
class TestSolveOpAPI_3(unittest.TestCase):
    def setUp(self):
        np.random.seed(2021)
        self.place = [paddle.CPUPlace()]
        self.dtype = "float32"
        if core.is_compiled_with_cuda():
            self.place.append(paddle.CUDAPlace(0))

    def check_static_result(self, place):
        paddle.enable_static()
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            paddle_input_x = fluid.data(
                name="input_x", shape=[10, 10], dtype=self.dtype)
            paddle_input_y = fluid.data(
                name="input_y", shape=[10, 4], dtype=self.dtype)
            paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)

            np_input_x = np.random.random([10, 10]).astype(self.dtype)
            np_input_y = np.random.random([10, 4]).astype(self.dtype)
            np_result = np.linalg.solve(np_input_x, np_input_y)

            exe = fluid.Executor(place)
            fetches = exe.run(
                fluid.default_main_program(),
                feed={"input_x": np_input_x,
                      "input_y": np_input_y},
                fetch_list=[paddle_result])
            self.assertTrue(
                np.allclose(
                    fetches[0],
                    np.linalg.solve(np_input_x, np_input_y),
                    rtol=1.e-4))

    def test_static(self):
        for place in self.place:
            self.check_static_result(place=place)

    def test_dygraph(self):
        def run(place):
            paddle.disable_static(place)
            np.random.seed(2021)
            input_x_np = np.random.random([10, 10]).astype(self.dtype)
            input_y_np = np.random.random([10, 4]).astype(self.dtype)
            tensor_input_x = paddle.to_tensor(input_x_np)
            tensor_input_y = paddle.to_tensor(input_y_np)

            numpy_output = np.linalg.solve(input_x_np, input_y_np)
            paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
            self.assertEqual(
                np.allclose(
                    numpy_output, paddle_output.numpy(), rtol=1.e-4),
                True)
            self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
            paddle.enable_static()

        for place in self.place:
            run(place)

# 3D + y broadcast case, FP64
class TestSolveOpAPI_4(unittest.TestCase):
    def setUp(self):
        np.random.seed(2021)
        self.place = [paddle.CPUPlace()]
        self.dtype = "float64"
        if core.is_compiled_with_cuda():
            self.place.append(paddle.CUDAPlace(0))

    def check_static_result(self, place):
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            paddle_input_x = fluid.data(
                name="input_x", shape=[2, 3, 3], dtype=self.dtype)
            paddle_input_y = fluid.data(
                name="input_y", shape=[1, 3, 3], dtype=self.dtype)
            paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)

            np_input_x = np.random.random([2, 3, 3]).astype(self.dtype)
            np_input_y = np.random.random([1, 3, 3]).astype(self.dtype)
            np_result = np.linalg.solve(np_input_x, np_input_y)

            exe = fluid.Executor(place)
            fetches = exe.run(
                fluid.default_main_program(),
                feed={"input_x": np_input_x,
                      "input_y": np_input_y},
                fetch_list=[paddle_result])
            self.assertTrue(
                np.allclose(fetches[0], np.linalg.solve(np_input_x,
                                                        np_input_y)))

    def test_static(self):
        for place in self.place:
            self.check_static_result(place=place)

    def test_dygraph(self):
        def run(place):
            paddle.disable_static(place)
            np.random.seed(2021)
            input_x_np = np.random.random([2, 3, 3]).astype(self.dtype)
            input_y_np = np.random.random([1, 3, 3]).astype(self.dtype)
            tensor_input_x = paddle.to_tensor(input_x_np)
            tensor_input_y = paddle.to_tensor(input_y_np)

            numpy_output = np.linalg.solve(input_x_np, input_y_np)
            paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
            self.assertEqual(
                np.allclose(numpy_output, paddle_output.numpy()), True)
            self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
            paddle.enable_static()

        for place in self.place:
            run(place)

class TestSolveOpSingularAPI(unittest.TestCase):
    # Singular matrix is not invertible
    def setUp(self):
        self.places = [fluid.CPUPlace()]
        self.dtype = "float64"
        if core.is_compiled_with_cuda():
            self.places.append(fluid.CUDAPlace(0))

    def check_static_result(self, place):
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            x = fluid.data(name="x", shape=[4, 4], dtype=self.dtype)
            y = fluid.data(name="y", shape=[4, 4], dtype=self.dtype)
            result = paddle.linalg.solve(x, y)

            input_x_np = np.ones([4, 4]).astype(self.dtype)
            input_y_np = np.ones([4, 4]).astype(self.dtype)

            exe = fluid.Executor(place)
            try:
                fetches = exe.run(fluid.default_main_program(),
                                  feed={"x": input_x_np,
                                        "y": input_y_np},
                                  fetch_list=[result])
            except RuntimeError as ex:
                print("The mat is singular")
                pass
            except ValueError as ex:
                print("The mat is singular")
                pass

    def test_static(self):
        for place in self.places:
            paddle.enable_static()
            self.check_static_result(place=place)

    def test_dygraph(self):
        for place in self.places:
            with fluid.dygraph.guard(place):
                input_x_np = np.ones([4, 4]).astype(self.dtype)
                input_y_np = np.ones([4, 4]).astype(self.dtype)
                input_x = fluid.dygraph.to_variable(input_x_np)
                input_y = fluid.dygraph.to_variable(input_y_np)

                try:
                    result = paddle.linalg.solve(input_x, input_y)
                except RuntimeError as ex:
                    print("The mat is singular")
                    pass
                except ValueError as ex:
                    print("The mat is singular")
                    pass


if __name__ == "__main__":
    unittest.main()
......@@ -48,6 +48,7 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
    'lgamma', \
    'svd', \
    'matrix_power', \
    'solve', \
]
NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\
......
......@@ -16,6 +16,7 @@ from .tensor.linalg import cholesky # noqa: F401
from .tensor.linalg import norm # noqa: F401
from .tensor.linalg import cond # noqa: F401
from .tensor.linalg import matrix_power # noqa: F401
from .tensor.linalg import solve # noqa: F401
from .tensor import inverse as inv # noqa: F401
from .tensor.linalg import eigvals # noqa: F401
from .tensor.linalg import multi_dot # noqa: F401
......@@ -39,5 +40,6 @@ __all__ = [
    'det',
    'slogdet',
    'eigh',
    'pinv'
    'pinv',
    'solve'
]
......@@ -51,6 +51,7 @@ from .linalg import multi_dot # noqa: F401
from .linalg import svd # noqa: F401
from .linalg import eigh # noqa: F401
from .linalg import pinv # noqa: F401
from .linalg import solve # noqa: F401
from .logic import equal # noqa: F401
from .logic import greater_equal # noqa: F401
from .logic import greater_than # noqa: F401
......@@ -386,6 +387,7 @@ tensor_method_func = [ #noqa
    'bitwise_not',
    'broadcast_tensors',
    'uniform_',
    'solve',
]
#this list used in math_op_patch.py for magic_method bind
......
......@@ -2072,3 +2072,60 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
attrs={'trans_x': False,
'trans_y': True}, )
return out_2
def solve(x, y, name=None):
    r"""
    Computes the solution of a square system of linear equations with a unique solution for inputs 'X' and 'Y'.
    Let :math:`X` be a square matrix or a batch of square matrices, and :math:`Y` be
    a vector/matrix or a batch of vectors/matrices. The equation is:

    .. math::
        Out = X^{-1} * Y

    Specifically,

    - This system of linear equations has one solution if and only if input 'X' is invertible.

    Args:
        x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or
            more batch dimensions. Its data type should be float32 or float64.
        y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or
            more batch dimensions. Its data type should be float32 or float64.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor: The solution of the square system of linear equations for inputs 'x' and 'y'.
        Its data type is the same as that of `x`.

    Examples:
    .. code-block:: python

        # a square system of linear equations:
        # 3*X0 + X1 = 9
        # X0 + 2*X1 = 8

        import paddle
        import numpy as np

        np_x = np.array([[3, 1], [1, 2]])
        np_y = np.array([9, 8])
        x = paddle.to_tensor(np_x, dtype="float64")
        y = paddle.to_tensor(np_y, dtype="float64")
        out = paddle.linalg.solve(x, y)

        print(out)
        # [2., 3.]
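        # A minimal sketch of the batched case (shapes mirror the 3-D unit test
        # above; variable names here are illustrative only): the leading batch
        # dimensions broadcast in the same way as np.linalg.solve, so x of
        # shape [2, 3, 3] with y of shape [1, 3, 3] yields a [2, 3, 3] result.
        batch_x = paddle.to_tensor(np.random.random([2, 3, 3]), dtype="float64")
        batch_y = paddle.to_tensor(np.random.random([1, 3, 3]), dtype="float64")
        batch_out = paddle.linalg.solve(batch_x, batch_y)
        print(batch_out.shape)
        # [2, 3, 3]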
"""
if in_dygraph_mode():
return _C_ops.solve(x, y)
inputs = {"X": [x], "Y": [y]}
helper = LayerHelper("solve", **locals())
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'solve')
check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'solve')
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type="solve", inputs={"X": x,
"Y": y}, outputs={"Out": out})
return out