未验证 提交 ecfddebb 编写于 作者: Y Yiqun Liu 提交者: GitHub

Add the implementation of inverse (#23310)

上级 34122e66
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
......
...@@ -85,7 +85,7 @@ endif() ...@@ -85,7 +85,7 @@ endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper)
if (WITH_GPU) if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/inverse_op.h"
#include <string>
#include <unordered_map>
namespace paddle {
namespace operators {
class InverseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Inverse");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "Inverse");
auto input_dims = ctx->GetInputDim("Input");
int64_t input_rank = input_dims.size();
PADDLE_ENFORCE_GE(
input_rank, 2,
platform::errors::InvalidArgument(
"The dimension of Input(Input) is expected to be no less than 2. "
"But recieved: Input(Input)'s dimension = %d, shape = [%s].",
input_rank, input_dims));
if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) {
PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1],
platform::errors::InvalidArgument(
"The last two dimensions are expected to be equal. "
"But recieved: %d and %d; "
"Input(Input)'s shape = [%s].",
input_dims[input_rank - 2],
input_dims[input_rank - 1], input_dims));
}
ctx->SetOutputDim("Output", input_dims);
ctx->ShareLoD("Input", /*->*/ "Output");
}
};
class InverseOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
const override {
static std::unordered_map<std::string, std::string> m{
{"Input", /*->*/ "Output"}};
return m;
}
};
class InverseGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto input_grad = framework::GradVarName("Input");
auto output_grad = framework::GradVarName("Output");
OP_INOUT_CHECK(ctx->HasInput("Output"), "Input", "Output", "InverseGrad");
OP_INOUT_CHECK(ctx->HasInput(output_grad), "Input", output_grad,
"InverseGrad");
if (ctx->HasOutput(input_grad)) {
ctx->SetOutputDim(input_grad, ctx->GetInputDim(output_grad));
}
}
};
class InverseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"(Tensor) A square matrix (2-D Tensor) or batches of square matrices"
" to inverse.");
AddOutput("Output", "(Tensor) The inverse of input matrix.");
AddComment(R"DOC(
Inverse Operator
Takes the inverse of the square matrix.
)DOC");
}
};
template <typename T>
class InverseGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad) const override {
grad->SetType(this->ForwardOpType() + "_grad");
grad->SetInput("Output", this->Output("Output"));
grad->SetInput(framework::GradVarName("Output"),
this->OutputGrad("Output"));
grad->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(inverse, ops::InverseOp, ops::InverseOpMaker,
ops::InverseOpInferVarType,
ops::InverseGradOpMaker<paddle::framework::OpDesc>,
ops::InverseGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(inverse_grad, ops::InverseGradOp);
REGISTER_OP_CPU_KERNEL(
inverse, ops::InverseKernel<paddle::platform::CPUDeviceContext, float>,
ops::InverseKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
inverse_grad,
ops::InverseGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::InverseGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/inverse_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
inverse, ops::InverseKernel<paddle::platform::CUDADeviceContext, float>,
ops::InverseKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
inverse_grad,
ops::InverseGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::InverseGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/matrix_inverse.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class InverseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<framework::Tensor>("Input");
auto* output = context.Output<framework::Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
mat_inv(dev_ctx, *input, output);
}
};
template <typename DeviceContext, typename T>
class InverseGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* a_inv = context.Input<framework::Tensor>("Output");
auto* a_inv_grad =
context.Input<framework::Tensor>(framework::GradVarName("Output"));
auto* a_grad =
context.Output<framework::Tensor>(framework::GradVarName("Input"));
if (a_grad) {
a_grad->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context);
auto& dev_ctx = context.template device_context<DeviceContext>();
framework::Tensor tmp_out =
context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx);
auto mat_dim_a0 =
math::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false);
auto mat_dim_b0 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true);
blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out,
T(0));
auto mat_dim_a1 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true);
auto mat_dim_b1 = math::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0));
}
}
};
} // namespace operators
} // namespace paddle
...@@ -21,6 +21,9 @@ function(math_library TARGET) ...@@ -21,6 +21,9 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu) list(APPEND hip_srcs ${TARGET}.hip.cu)
endif() endif()
...@@ -68,6 +71,7 @@ math_library(vol2col) ...@@ -68,6 +71,7 @@ math_library(vol2col)
math_library(prelu) math_library(prelu)
math_library(bert_encoder_functor) math_library(bert_encoder_functor)
math_library(tree2col DEPS math_function) math_library(tree2col DEPS math_function)
math_library(matrix_inverse)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
......
...@@ -223,6 +223,19 @@ class Blas { ...@@ -223,6 +223,19 @@ class Blas {
CBLAS_DIAG diag, int M, int N, T alpha, const T* A, int lda, T* B, CBLAS_DIAG diag, int M, int N, T alpha, const T* A, int lda, T* B,
int ldb) const; int ldb) const;
#ifdef PADDLE_WITH_CUDA
template <typename T>
void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const;
template <typename T>
void BatchedGETRI(int n, const T** a, const int* ipiv, T** a_inv, int* info,
int batch_size) const;
template <typename T>
void BatchedMatInv(int n, const T** a, T** a_inv, int* info,
int batch_size) const;
#endif
private: private:
const DeviceContext& context_; const DeviceContext& context_;
}; };
...@@ -361,6 +374,23 @@ class BlasT : private Blas<DeviceContext> { ...@@ -361,6 +374,23 @@ class BlasT : private Blas<DeviceContext> {
Base()->template TRSM<T>(args...); Base()->template TRSM<T>(args...);
} }
#ifdef PADDLE_WITH_CUDA
template <typename... ARGS>
void BatchedGETRF(ARGS... args) const {
Base()->template BatchedGETRF<T>(args...);
}
template <typename... ARGS>
void BatchedGETRI(ARGS... args) const {
Base()->template BatchedGETRI<T>(args...);
}
template <typename... ARGS>
void BatchedMatInv(ARGS... args) const {
Base()->template BatchedMatInv<T>(args...);
}
#endif
private: private:
const Blas<DeviceContext>* Base() const { const Blas<DeviceContext>* Base() const {
return static_cast<const Blas<DeviceContext>*>(this); return static_cast<const Blas<DeviceContext>*>(this);
......
...@@ -93,6 +93,24 @@ struct CUBlas<float> { ...@@ -93,6 +93,24 @@ struct CUBlas<float> {
static void TRSM(ARGS... args) { static void TRSM(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...)); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...));
} }
template <typename... ARGS>
static void GETRF_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSgetrfBatched(args...));
}
template <typename... ARGS>
static void GETRI_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSgetriBatched(args...));
}
template <typename... ARGS>
static void MATINV_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSmatinvBatched(args...));
}
}; };
template <> template <>
...@@ -141,6 +159,24 @@ struct CUBlas<double> { ...@@ -141,6 +159,24 @@ struct CUBlas<double> {
static void TRSM(ARGS... args) { static void TRSM(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...)); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...));
} }
template <typename... ARGS>
static void GETRF_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDgetrfBatched(args...));
}
template <typename... ARGS>
static void GETRI_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDgetriBatched(args...));
}
template <typename... ARGS>
static void MATINV_BATCH(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDmatinvBatched(args...));
}
}; };
template <> template <>
...@@ -446,6 +482,44 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, ...@@ -446,6 +482,44 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
}); });
} }
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
int *info,
int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
const int *ipiv, T **a_inv,
int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
T **a_inv, int *info,
int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
#include "Eigen/Core"
#include "Eigen/LU"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class MatrixInverseFunctor<platform::CPUDeviceContext, T> {
using Matrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using EigenMatrixMap = Eigen::Map<Matrix>;
using ConstEigenMatrixMap = Eigen::Map<const Matrix>;
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& a, framework::Tensor* a_inv) {
const auto& mat_dims = a.dims();
const int rank = mat_dims.size();
int n = mat_dims[rank - 1];
int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
const T* a_ptr = a.data<T>();
T* a_inv_ptr = a_inv->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; ++i) {
ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n);
EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n);
Eigen::PartialPivLU<Matrix> lu;
lu.compute(mat);
const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
PADDLE_ENFORCE_GT(
min_abs_pivot, static_cast<T>(0),
platform::errors::InvalidArgument("Input is not invertible."));
mat_inv.noalias() = lu.inverse();
}
}
};
template class MatrixInverseFunctor<platform::CPUDeviceContext, float>;
template class MatrixInverseFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& a, framework::Tensor* a_inv) {
const auto& mat_dims = a.dims();
const int rank = mat_dims.size();
int n = mat_dims[rank - 1];
int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
memory::allocation::AllocationPtr tmp_gpu_mat_data;
const T* gpu_mat = a.data<T>();
if (n >= 32) {
// Copy all elements of input matrix A to a temporary memory space to
// avoid being overriden by getrf.
tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
tmp_gpu_mat_data->ptr(),
boost::get<platform::CUDAPlace>(context.GetPlace()),
a.data<void>(), a.numel() * sizeof(T), context.stream());
gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
}
std::vector<const T*> cpu_ptrs(batch_size * 2);
for (int i = 0; i < batch_size; ++i) {
cpu_ptrs[i] = gpu_mat + i * n * n;
cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
}
// Copy the addresses of A and A_inv from host to device.
memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*), context.stream());
T** gpu_inv_ptrs =
reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
// Allocate device memory for info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
memory::allocation::AllocationPtr tmp_gpu_info_data =
memory::Alloc(context, num_ints * sizeof(int));
int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
// This functions in cuBLAS is intended to be used for matrices of small
// sizes where the launch overhead is a significant factor.
// TODO(Xreki): call function in cusolver for large matrices.
if (n < 32) {
// cublas<S/D>matinvBatched is a short cut of cublas<S/D>getrfBatched
// plus cublas<S/D>getriBatched.
// However it only works if N is less than 32. If not, we need to
// go through cublas<S/D>getrfBatched and cublas<S/D>getriBatched.
blas.BatchedMatInv(n,
reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
gpu_inv_ptrs, gpu_info_ptr, batch_size);
} else {
// This function performs the LU factorization of each matrix A by the
// equation P * A = L * U. L and U are written back to original matrix A,
// and diagonal elements of L are discarded.
int* gpu_pivot_ptr =
reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
blas.BatchedGETRF(n, reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
gpu_pivot_ptr, gpu_info_ptr, batch_size);
blas.BatchedGETRI(n,
reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size);
}
}
};
template class MatrixInverseFunctor<platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
class MatrixInverseFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& a,
framework::Tensor* a_inv);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -90,7 +90,9 @@ extern void *cublas_dso_handle; ...@@ -90,7 +90,9 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); __macro(cublasDgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
...@@ -167,7 +167,7 @@ from .tensor.math import div #DEFINE_ALIAS ...@@ -167,7 +167,7 @@ from .tensor.math import div #DEFINE_ALIAS
from .tensor.math import add #DEFINE_ALIAS from .tensor.math import add #DEFINE_ALIAS
from .tensor.math import atan #DEFINE_ALIAS from .tensor.math import atan #DEFINE_ALIAS
from .tensor.math import logsumexp #DEFINE_ALIAS from .tensor.math import logsumexp #DEFINE_ALIAS
# from .tensor.math import inverse #DEFINE_ALIAS from .tensor.math import inverse #DEFINE_ALIAS
from .tensor.math import log1p #DEFINE_ALIAS from .tensor.math import log1p #DEFINE_ALIAS
from .tensor.math import erf #DEFINE_ALIAS from .tensor.math import erf #DEFINE_ALIAS
from .tensor.math import addcmul #DEFINE_ALIAS from .tensor.math import addcmul #DEFINE_ALIAS
......
...@@ -1221,9 +1221,9 @@ class OpTest(unittest.TestCase): ...@@ -1221,9 +1221,9 @@ class OpTest(unittest.TestCase):
def err_msg(): def err_msg():
offset = np.argmax(diff_mat > max_relative_error) offset = np.argmax(diff_mat > max_relative_error)
return ("%s error, %s variable %s max gradient diff %f over limit %f, " return ("Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, "
"the first error element is %d, expected %f, but got %f.") \ "the first error element is %d, expected %e, but got %e.") \
% (self.op_type, msg_prefix, name, max_diff, max_relative_error, % (self.op_type, msg_prefix, name, str(a.shape), self.dtype, max_diff, max_relative_error,
offset, a.flatten()[offset], b.flatten()[offset]) offset, a.flatten()[offset], b.flatten()[offset])
self.assertLessEqual(max_diff, max_relative_error, err_msg()) self.assertLessEqual(max_diff, max_relative_error, err_msg())
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle
from op_test import OpTest
class TestInverseOp(OpTest):
def config(self):
self.matrix_shape = [10, 10]
self.dtype = "float64"
def setUp(self):
self.op_type = "inverse"
self.config()
np.random.seed(123)
mat = np.random.random(self.matrix_shape).astype(self.dtype)
inverse = np.linalg.inv(mat)
self.inputs = {'Input': mat}
self.outputs = {'Output': inverse}
def test_check_output(self):
self.check_output()
def test_grad(self):
self.check_grad(['Input'], 'Output')
class TestInverseOpBatched(TestInverseOp):
def config(self):
self.matrix_shape = [8, 4, 4]
self.dtype = "float64"
class TestInverseOpLarge(TestInverseOp):
def config(self):
self.matrix_shape = [32, 32]
self.dtype = "float64"
def test_grad(self):
self.check_grad(['Input'], 'Output', max_relative_error=1e-6)
class TestInverseOpFP32(TestInverseOp):
def config(self):
self.matrix_shape = [10, 10]
self.dtype = "float32"
def test_grad(self):
self.check_grad(['Input'], 'Output', max_relative_error=1e-2)
class TestInverseOpBatchedFP32(TestInverseOpFP32):
def config(self):
self.matrix_shape = [8, 4, 4]
self.dtype = "float32"
class TestInverseOpLargeFP32(TestInverseOpFP32):
def config(self):
self.matrix_shape = [32, 32]
self.dtype = "float32"
class TestInverseAPI(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_static_result(self, place, with_out=False):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[4, 4], dtype="float64")
if with_out:
out = fluid.data(name="output", shape=[4, 4], dtype="float64")
else:
out = None
result = paddle.inverse(input=input, out=out)
input_np = np.random.random([4, 4]).astype("float64")
result_np = np.linalg.inv(input_np)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": input_np},
fetch_list=[result])
self.assertTrue(np.allclose(fetches[0], np.linalg.inv(input_np)))
def test_static(self):
for place in self.places:
self.check_static_result(place=place)
def test_dygraph(self):
for place in self.places:
with fluid.dygraph.guard(place):
input_np = np.random.random([4, 4]).astype("float64")
input = fluid.dygraph.to_variable(input_np)
result = paddle.inverse(input)
self.assertTrue(
np.allclose(result.numpy(), np.linalg.inv(input_np)))
class TestInverseAPIError(unittest.TestCase):
def test_errors(self):
input_np = np.random.random([4, 4]).astype("float64")
# input must be Variable.
self.assertRaises(TypeError, paddle.inverse, input_np)
# The data type of input must be float32 or float64.
for dtype in ["bool", "int32", "int64", "float16"]:
input = fluid.data(name='input_' + dtype, shape=[4, 4], dtype=dtype)
self.assertRaises(TypeError, paddle.inverse, input)
# When out is set, the data type must be the same as input.
input = fluid.data(name='input_1', shape=[4, 4], dtype="float32")
out = fluid.data(name='output', shape=[4, 4], dtype="float64")
self.assertRaises(TypeError, paddle.inverse, input, out)
# The number of dimensions of input must be >= 2.
input = fluid.data(name='input_2', shape=[4], dtype="float32")
self.assertRaises(ValueError, paddle.inverse, input)
if __name__ == "__main__":
unittest.main()
...@@ -39,7 +39,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [ ...@@ -39,7 +39,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
'spp', \ 'spp', \
'teacher_student_sigmoid_loss', \ 'teacher_student_sigmoid_loss', \
'unpool', \ 'unpool', \
'yolov3_loss' 'yolov3_loss', \
'inverse'
] ]
NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp'] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
...@@ -145,7 +145,7 @@ from .math import div #DEFINE_ALIAS ...@@ -145,7 +145,7 @@ from .math import div #DEFINE_ALIAS
from .math import add #DEFINE_ALIAS from .math import add #DEFINE_ALIAS
from .math import atan #DEFINE_ALIAS from .math import atan #DEFINE_ALIAS
from .math import logsumexp #DEFINE_ALIAS from .math import logsumexp #DEFINE_ALIAS
# from .math import inverse #DEFINE_ALIAS from .math import inverse #DEFINE_ALIAS
from .math import log1p #DEFINE_ALIAS from .math import log1p #DEFINE_ALIAS
from .math import erf #DEFINE_ALIAS from .math import erf #DEFINE_ALIAS
from .math import addcmul #DEFINE_ALIAS from .math import addcmul #DEFINE_ALIAS
......
...@@ -105,7 +105,7 @@ __all__ = [ ...@@ -105,7 +105,7 @@ __all__ = [
'add', 'add',
'atan', 'atan',
'logsumexp', 'logsumexp',
# 'inverse', 'inverse',
'log1p', 'log1p',
'erf', 'erf',
'addcmul', 'addcmul',
...@@ -986,6 +986,7 @@ def mm(input, mat2, out=None, name=None): ...@@ -986,6 +986,7 @@ def mm(input, mat2, out=None, name=None):
'Y': mat2}, outputs={'Out': out}) 'Y': mat2}, outputs={'Out': out})
return out return out
def addmm(input, x, y, alpha=1.0, beta=1.0, name=None): def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
""" """
**addmm** **addmm**
...@@ -1120,6 +1121,78 @@ def logsumexp(x, dim=None, keepdim=False, out=None, name=None): ...@@ -1120,6 +1121,78 @@ def logsumexp(x, dim=None, keepdim=False, out=None, name=None):
return layers.log(sum_out, name) return layers.log(sum_out, name)
def inverse(input, out=None, name=None):
"""
Takes the inverse of the square matrix. A square matrix is a matrix with
the same number of rows and columns. The input can be a square matrix
(2-D Tensor) or batches of square matrices.
Args:
input (Variable): The input Variable which holds a Tensor. The last two
dimensions should be equal. When the number of dimensions is
greater than 2, it is treated as batches of square matrix. The data
type can be float32 and float64.
out (Variable, optional): Optional output which can be any created
Variable that meets the requirements to store the result of operation.
If out is None, a new Varibale will be create to store the result.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information,
please refer to :ref:`api_guide_Name`
Returns:
Variable: A Tensor holds the inverse of input. The shape and data type
is the same as input.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
mat_np = np.array([[2, 0], [0, 2]]).astype("float32")
# example for static graph
input = fluid.data("input", shape=[2, 2], dtype="float32")
out = paddle.inverse(input)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
results = exe.run(feed={"input": mat_np },
fetch_list=[out.name])
print(results[0]) # [[0.5, 0], [0, 0.5]]
# example for dynamic graph
with fluid.dygraph.guard():
mat = fluid.dygraph.to_variable(mat_np)
inv = paddle.inverse(mat)
print(inv) # [[0.5, 0], [0, 0.5]]
"""
if in_dygraph_mode():
return core.ops.inverse(input)
def _check_input(input):
check_variable_and_dtype(input, 'input',
['float32', 'float64'], 'inverse')
if len(input.shape) < 2:
raise ValueError(
"The input of inverse is expected to be a Tensor whose number "
"of dimensions is no less than 2. But reviced: %d, "
"input's shape: %s." % (len(input.shape), input.shape))
if out is not None:
check_variable_and_dtype(out, 'out', input.dtype, 'inverse')
_check_input(input)
helper = LayerHelper('inverse', **locals())
if out is None:
out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type='inverse', inputs={'Input': [input] }, outputs={'Output': [out]})
return out
def max(input, dim=None, keep_dim=False, out=None, name=None): def max(input, dim=None, keep_dim=False, out=None, name=None):
""" """
Computes the maximum of tensor elements over the given dimension. Computes the maximum of tensor elements over the given dimension.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册