"paddle/phi/kernels/tril_grad_kernel.h" does not exist at "896a37b6e3953f2093d7608100539c5c1c50fc36"
Commit c1c6b869 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -1256,7 +1256,7 @@ if __name__ == "__main__":
        # Node Definition Generation
        definition_declaration_pair = GenerateForwardDefinition(
            fwd_api_name, bwd_api_name, forward_inputs_position_map,
-           forward_outputs_position_map, forward_attrs_list,
+           forward_outputs_position_map, orig_forward_attrs_list,
            backward_fwd_input_map, backward_grad_input_map,
            backward_grad_output_map, backward_attrs_list, optional_inputs,
            intermediate_outputs)
@@ -1268,7 +1268,7 @@ if __name__ == "__main__":
        # For python-level API dispatch
        CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
                                  forward_outputs_position_map,
-                                 forward_attrs_list)
+                                 orig_forward_attrs_list)
        if len(namespace) > 0:
            forward_definition_str += f"""namespace {namespace} {{
...
@@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
    return;
  }
// NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
  // NOTE(yy): TransDataDevice should wait for computation of input.
  if (!platform::is_cuda_pinned_place(in.place())) {
    platform::DeviceContextPool::Instance().Get(in.place())->Wait();
...
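The hunk above adds a CPU->MLU fast path that copies directly instead of first synchronizing on the source place. A minimal standalone sketch of that control flow, assuming made-up Place/copy/wait helpers rather than Paddle's real DeviceContext machinery:

#include <iostream>

// Hypothetical stand-ins for Paddle's Place / DeviceContext machinery.
enum class Place { kCPU, kGPU, kMLU };

static void wait_for_device(Place p) {
  std::cout << "sync stream on source place " << static_cast<int>(p) << "\n";
}

static void fake_copy(Place src, Place dst) {
  std::cout << "copy " << static_cast<int>(src) << " -> "
            << static_cast<int>(dst) << "\n";
}

// Mirrors the shape of the patched TransDataDevice: a special case that
// skips the source-stream sync for CPU -> MLU, and the generic path that
// waits on the source device before copying.
void trans_data_device(Place src, Place dst) {
  if (src == dst) return;
  if (src == Place::kCPU && dst == Place::kMLU) {  // fast path from the diff
    fake_copy(src, dst);
    return;
  }
  wait_for_device(src);  // generic path: wait for the producer, then copy
  fake_copy(src, dst);
}

int main() {
  trans_data_device(Place::kCPU, Place::kMLU);  // no sync
  trans_data_device(Place::kGPU, Place::kCPU);  // sync, then copy
}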
@@ -95,6 +95,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromBlock(
  std::unordered_map<std::string, std::pair<VarDesc *, int>>
      name_to_desc_block_id;
block_id_ = block.ID();
  const BlockDesc *block_var_visible = &block;
  while (block_var_visible != nullptr) {
    for (auto *var : block_var_visible->AllVars()) {
...
@@ -230,6 +230,7 @@ class Graph {
    auto *x =
        AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -245,6 +246,7 @@ class Graph {
                          "The OpDesc used to create operator node is null."));
    auto *x = AddNode(new ir::Node(op_desc));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -263,6 +265,7 @@ class Graph {
                        num_node_created_);
    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
@@ -276,6 +279,7 @@ class Graph {
    }
    auto *x = AddNode(new ir::Node(name, type, block_id_));
    x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
    return x;
  }
...
@@ -125,6 +125,7 @@ class Node {
  // Only use this for auto parallel.
  // A node does not have original desc if the return is zero.
  uint64_t OriginalDescId() const { return original_desc_id_; }
int GraphId() const { return graph_id_; }
  bool IsOp() const { return type_ == Type::kOperation; }
  bool IsVar() const { return type_ == Type::kVariable; }
@@ -246,10 +247,12 @@ class Node {
  // Store the original id of var desc or op desc.
  // Only use this for auto parallel.
  uint64_t original_desc_id_{0};
int graph_id_{-1};
 private:
  // ID can only set by a Graph.
  void SetId(int id) { id_ = id; }
void SetGraphId(int graph_id) { graph_id_ = graph_id; }
  // desc_order can only set by a Graph when constructing a Graph from a
  // BlockDesc.
...
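The node.h part of this change stores the owning block/graph id on every Node and, like SetId, keeps the setter private so only a Graph can assign it. A toy sketch of that ownership pattern with illustrative names (not Paddle's actual classes):

#include <cassert>
#include <memory>
#include <vector>

class Graph;  // forward declaration so Node can befriend it

class Node {
 public:
  int Id() const { return id_; }
  int GraphId() const { return graph_id_; }  // -1 until a Graph adopts the node

 private:
  // Only a Graph may assign ids, mirroring SetId/SetGraphId in the diff.
  void SetId(int id) { id_ = id; }
  void SetGraphId(int graph_id) { graph_id_ = graph_id; }

  int id_{-1};
  int graph_id_{-1};
  friend class Graph;
};

class Graph {
 public:
  explicit Graph(int block_id) : block_id_(block_id) {}

  Node* AddNode() {
    nodes_.push_back(std::make_unique<Node>());
    Node* n = nodes_.back().get();
    n->SetId(num_node_created_++);
    n->SetGraphId(block_id_);  // every node remembers which graph owns it
    return n;
  }

 private:
  int block_id_;
  int num_node_created_{0};
  std::vector<std::unique_ptr<Node>> nodes_;
};

int main() {
  Graph g(/*block_id=*/3);
  Node* n = g.AddNode();
  assert(n->Id() == 0 && n->GraphId() == 3);
}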
@@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
  if (platform::is_xpu_place(expected_kernel_key.place_) &&
      (kernel_iter == kernels.end() ||
       !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
@@ -1470,18 +1471,37 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_XPU_KP
if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
    bool use_xpu_kp_kernel_rt =
        FLAGS_run_kp_kernel &&
        paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
    bool use_xpu_kp_kernel_debug =
        paddle::platform::is_in_xpu_kpwhite_list(type_);
-  if (platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
}
if (use_xpu_kp_kernel_debug) {
VLOG(3) << "xpu_kp using debug mode ";
}
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
if (is_xpu_kp_support) {
      expected_kernel_key.library_type_ = LibraryType::kKP;
      kernel_iter = kernels.find(expected_kernel_key);
      VLOG(3) << "using XPU KP kernel: " << type_
              << ", using_kernel_key:" << expected_kernel_key;
    }
bool is_xpu_unsupport =
(!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_));
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_IPU
...
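The ChooseKernel hunk reduces to a three-way decision: use a KP kernel when the runtime flag or the KP whitelist allows it, otherwise keep the plain XPU kernel, and fall back to CPU when the op is blacklisted, unsupported, or simply not registered for XPU. A compact sketch of that decision table in plain C++, with booleans standing in for the flag, whitelist, blacklist, and registry lookups:

#include <iostream>

enum class KernelChoice { kXPU_KP, kXPU, kCPUFallback };

// Inputs mirror the conditions in the diff; in Paddle they come from
// FLAGS_run_kp_kernel, is_xpu_kp_support_op, is_in_xpu_kpwhite_list,
// is_xpu_support_op / the black list, and the kernel registry lookup.
KernelChoice choose_kernel(bool run_kp_flag, bool kp_supported,
                           bool kp_whitelisted, bool xpu_supported,
                           bool xpu_blacklisted, bool xpu_kernel_registered) {
  bool use_kp_rt = run_kp_flag && kp_supported;
  bool use_kp_debug = kp_whitelisted;
  bool is_xpu_kp_support = use_kp_rt || use_kp_debug;
  if (is_xpu_kp_support) return KernelChoice::kXPU_KP;

  bool is_xpu_unsupport = !xpu_supported || xpu_blacklisted;
  if (!xpu_kernel_registered || is_xpu_unsupport)
    return KernelChoice::kCPUFallback;  // "fallbacking to CPU one!"
  return KernelChoice::kXPU;
}

int main() {
  // 0: the KP kernel wins when the flag and KP support line up.
  std::cout << static_cast<int>(
                   choose_kernel(true, true, false, true, false, true))
            << "\n";
  // 2: an op unsupported on XPU falls back to the CPU kernel.
  std::cout << static_cast<int>(
                   choose_kernel(false, false, false, false, false, true))
            << "\n";
}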
@@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
-    int32_t size;
+    int32_t size = -1;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
"Cannot read tensor desc size"));
PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
"Tensor desc size should >= 0"));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
...
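The TensorFromStream change initializes the size field and checks both the stream state and the value before allocating, so a truncated or corrupted stream produces a clear error instead of an oversized or undefined allocation. A standalone sketch of the same defensive pattern for a length-prefixed blob, using plain exceptions in place of PADDLE_ENFORCE:

#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Read a length-prefixed blob: a native-endian int32 size followed by
// `size` bytes. Mirrors the validation added in the diff: check the stream
// after reading the prefix and reject negative sizes before allocating.
std::vector<char> ReadSizedBlob(std::istream& is) {
  int32_t size = -1;  // initialized, like `int32_t size = -1;` in the patch
  is.read(reinterpret_cast<char*>(&size), sizeof(size));
  if (!is.good()) throw std::runtime_error("Cannot read blob size");
  if (size < 0) throw std::runtime_error("Blob size should be >= 0");

  std::vector<char> buf(static_cast<size_t>(size));
  is.read(buf.data(), size);
  if (is.gcount() != size) throw std::runtime_error("Blob truncated");
  return buf;
}

int main() {
  std::string payload = "hello";
  int32_t n = static_cast<int32_t>(payload.size());
  std::ostringstream os;
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os << payload;

  std::istringstream is(os.str());
  std::vector<char> blob = ReadSizedBlob(is);
  return blob.size() == payload.size() ? 0 : 1;
}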
@@ -124,7 +124,7 @@ AmpOperators::AmpOperators()
      OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
  unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                unsupported_ops_gpu_bf16.end());
-// NOTE: GPU/NPU/XPU is compiled seperatly.
+// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
#elif defined(PADDLE_WITH_ASCEND_CL)
  auto unsupported_ops_npu_fp16 = std::get<2>(
      OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
@@ -143,6 +143,15 @@ AmpOperators::AmpOperators()
      OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
  unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
                                unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
  VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
          << unsupported_fp16_ops_->size() << " "
@@ -210,6 +219,7 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
  if (paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place)) {
    // CudaPinndePlace is added for varbase created by dataloader
...
@@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
  auto& kernels = kernels_iter->second;
  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
      (kernel_iter == kernels.end() || is_xpu_unsupport)) {
    VLOG(3) << "missing XPU kernel: " << op.Type()
@@ -243,11 +243,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
    expected_kernel_key.place_ = platform::CPUPlace();
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
#ifdef PADDLE_WITH_XPU_KP
-  expected_kernel_key.place_ = platform::XPUPlace();
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
    bool use_xpu_kp_kernel_rt =
        FLAGS_run_kp_kernel &&
        paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
@@ -259,14 +258,22 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
    if (use_xpu_kp_kernel_debug) {
      VLOG(3) << "xpu_kp using debug mode ";
    }
-  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
expected_kernel_key.place_ = platform::XPUPlace();
      expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
      kernel_iter = kernels.find(expected_kernel_key);
      VLOG(3) << "using XPU KP kernel: " << op.Type()
              << ", using_kernel_key:" << expected_kernel_key;
    }
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
...
@@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext(
  }
  for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
    if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
      if (attrs.find(attr_names[i]) !=
          attrs.end()) {  // shape is in the attribute
...
@@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
                       ThresholdedReluFunctor, ThresholdedReluGradFunctor);
REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor,
HardShrinkGradFunctor);
REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
SoftShrinkGradFunctor);
REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
TanhShrinkGradFunctor);
REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
/* ==========================  sigmoid register  =============================
 */
@@ -1626,22 +1633,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(elu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
elu_grad, ops::ELUGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ELUGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================    logit register     ============================
...
@@ -279,6 +279,15 @@ USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
USE_PHI_FUNCTOR(HardShrink)
USE_PHI_FUNCTOR(SoftShrink)
USE_PHI_FUNCTOR(TanhShrink)
USE_PHI_FUNCTOR(Silu)
USE_PHI_FUNCTOR(ELU)
USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
template <typename T>
using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
@@ -392,31 +401,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
  }
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
out.device(d) = x * temp;
}
};
// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x}))
template <typename T>
struct SiluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) + (-x).exp(); // 1+e^(-x)
auto temp2 = x * (-x).exp(); // x*e^(-x)
dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
(static_cast<T>(1) + (temp2 / temp1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// Originally: logsigmoid(x) = -log (1 + exp(-x))
// For numerical stability, we can use the log-sum-exp trick:
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
@@ -512,99 +496,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x - x.tanh();
}
};
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x.tanh() * x.tanh());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct HardShrinkFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 || temp2).template cast<T>();
}
};
template <typename T>
struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 || temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
template <typename T>
struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// sqrt(x) = x^(1/2)
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
@@ -1036,59 +927,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
(x < static_cast<T>(0))
.select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
}
};
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
dx.device(d) = (out > static_cast<T>(0))
.select(dout, dout * (out + static_cast<T>(alpha)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
dx.device(d) = (x > static_cast<T>(0))
.select(dout, dout * static_cast<T>(alpha) * x.exp());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradKernel : public framework::OpKernel<T> {
 public:
@@ -1354,44 +1192,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
if (dX) {
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>();
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
ddout.device(*d) = ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
  float alpha;
@@ -2152,9 +1952,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
}  // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro)                                       \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
__macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
@@ -2167,8 +1965,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
__macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
          HardSigmoidGradFunctor);                                            \
  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
...
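For reference while reading the removals above: the deleted functors (now provided by phi) implement simple elementwise formulas. hardshrink zeroes values inside [-threshold, threshold], softshrink additionally shifts the surviving values toward zero by lambda, and tanhshrink(x) = x - tanh(x). A small host-side sketch of those forward formulas, matching the math of the removed functors rather than the phi implementation itself:

#include <cmath>
#include <cstdio>

// hardshrink(x) = x if |x| > threshold, else 0
double hard_shrink(double x, double threshold) {
  return (x < -threshold || x > threshold) ? x : 0.0;
}

// softshrink(x) = x - lambda if x > lambda; x + lambda if x < -lambda; else 0
double soft_shrink(double x, double lambda) {
  if (x > lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.0;
}

// tanhshrink(x) = x - tanh(x)
double tanh_shrink(double x) { return x - std::tanh(x); }

int main() {
  for (double x : {-2.0, -0.3, 0.0, 0.3, 2.0}) {
    std::printf("x=% .1f hard=% .3f soft=% .3f tanhshrink=% .3f\n", x,
                hard_shrink(x, 0.5), soft_shrink(x, 0.5), tanh_shrink(x));
  }
}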
@@ -44,35 +44,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct CudaSiluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// silu(x) = x / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x / (one + exp(-x)));
}
};
template <typename T>
struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp = one / (one + exp(-x));
return static_cast<T>(dout * (temp * (one + x * (one - temp))));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -110,43 +81,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// softshrink(x) = x - lambda, if x > lambda;
// x + lambda, if x < -lambda;
// 0, otherwise.
__device__ __forceinline__ T operator()(const T x) const {
T l = static_cast<T>(lambda);
T temp1 = static_cast<T>(x > l);
T temp2 = static_cast<T>(x < -l);
return temp1 * (x - l) + temp2 * (x + l);
}
};
template <typename T>
struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// dx = dout, if x > lambda or x < -lambda else 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T l = static_cast<T>(lambda);
return (x >= -l && x <= l) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaCeilFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -615,66 +549,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// tanhshrink(x) = x - tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x - tanh(x));
}
};
template <typename T>
struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// dx = dout * tanh(x)^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(dout * tanh(x) * tanh(x));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : x;
}
};
template <typename T>
struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (x > -threshold && x < threshold) ? 0 : dout
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
@@ -863,110 +737,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaELUFunctor : public BaseActivationFunctor<T> {
using CT = typename details::MPTypeTrait<T>::Type;
CT zero = static_cast<CT>(0.0f);
CT one = static_cast<CT>(1.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// elu(x) = x, if x > 0
// elu(x) = alpha * (e^x - 1), if x <= 0
__device__ __forceinline__ T operator()(const T arg_x) const {
CT x = static_cast<CT>(arg_x);
CT temp = static_cast<CT>(alpha) * (exp(x) - one);
CT res = x > zero ? x : temp;
return static_cast<T>(res);
}
};
template <typename T>
struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
__device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType a = static_cast<MPType>(alpha);
MPType out_pos = static_cast<MPType>(out > zero);
MPType out_neg = static_cast<MPType>(out <= zero);
return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
__device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(alpha);
MPType x_pos = static_cast<MPType>(x > zero);
MPType x_neg = static_cast<MPType>(x <= zero);
return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* out = ctx.Input<framework::Tensor>("Out");
auto* x = ctx.Input<framework::Tensor>("X");
auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace());
const float alpha = ctx.Attr<float>("alpha");
auto& dev_ctx = ctx.device_context<DeviceContext>();
std::vector<const framework::Tensor*> ins = {d_out, out};
std::vector<framework::Tensor*> outs = {d_x};
if (alpha > 0) {
CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else {
CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
template <typename T>
struct CudaCELUFunctor : public BaseActivationFunctor<T> {
  using CT = typename details::MPTypeTrait<T>::Type;
@@ -1099,6 +869,15 @@ USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
}  // namespace operators
}  // namespace paddle
@@ -1158,26 +937,6 @@ namespace plat = paddle::platform;
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                  \
                                    ops::grad_functor<plat::bfloat16>>);
/* ======================== elu register ============================ */
REGISTER_OP_CUDA_KERNEL(
elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<float>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<double>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaELUFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
elu_grad, ops::ELUGradCudaKernel<plat::CUDADeviceContext, float>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, double>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================    celu register  ============================ */
@@ -1359,7 +1118,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
  __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
          CudaLogSigmoidGradFunctor);                                         \
  __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
...
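The removed CUDA functors encode silu(x) = x * sigmoid(x) and its derivative sigmoid(x) * (1 + x * (1 - sigmoid(x))), which is the factor CudaSiluGradFunctor multiplies dout by. A quick host-side numerical check of that derivative formula (plain C++, independent of the CUDA code):

#include <cmath>
#include <cstdio>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// silu(x) = x / (1 + exp(-x)) = x * sigmoid(x)
double silu(double x) { return x * sigmoid(x); }

// Analytic derivative used by the (removed) grad functor:
// d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
double silu_grad(double x) {
  double s = sigmoid(x);
  return s * (1.0 + x * (1.0 - s));
}

int main() {
  const double eps = 1e-6;
  for (double x : {-3.0, -0.5, 0.0, 0.5, 3.0}) {
    double numeric = (silu(x + eps) - silu(x - eps)) / (2 * eps);
    std::printf("x=% .1f analytic=% .6f numeric=% .6f\n", x, silu_grad(x),
                numeric);
  }
}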
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle { namespace paddle {
...@@ -20,6 +21,8 @@ namespace operators { ...@@ -20,6 +21,8 @@ namespace operators {
template <typename T> template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> { class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto &place = ctx.GetPlace();
@@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
    // alloc memory
    y->mutable_data<T>(place);
-    mean_out->mutable_data<T>(place);
-    variance_out->mutable_data<T>(place);
-    saved_mean->mutable_data<T>(place);
-    saved_variance->mutable_data<T>(place);
+    mean_out->mutable_data<MPDType>(place);
+    variance_out->mutable_data<MPDType>(place);
+    saved_mean->mutable_data<MPDType>(place);
+    saved_variance->mutable_data<MPDType>(place);
    Tensor transformed_x;
    Tensor transformed_y;
@@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
@@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
    auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
    auto d_x_tmp =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
-    auto scale_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(scale->dims(), dev_ctx);
-    auto bias_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(bias->dims(), dev_ctx);
+    auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
+        scale->dims(), dev_ctx);
+    auto bias_grad_tmp =
+        ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
    if (d_x == nullptr) {
      d_x = &d_x_tmp;
@@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
    const auto &place = ctx.GetPlace();
    d_x->mutable_data<T>(place);
-    d_scale->mutable_data<T>(place);
-    d_bias->mutable_data<T>(place);
+    d_scale->mutable_data<MPDType>(place);
+    d_bias->mutable_data<MPDType>(place);
    use_global_stats = is_test || use_global_stats;
...
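The batch-norm kernels now allocate the statistics (mean/variance/saved_*) and the scale/bias gradients as MPDType, the "master precision" type that MPTypeTrait maps float16 to float, so accumulation stays in fp32 even when the data is fp16. A minimal sketch of such a trait, for illustration only (Paddle's real trait lives in fp16_type_traits.h):

#include <cstdint>
#include <type_traits>

// Stand-in for a 16-bit float payload; only the storage width matters here.
struct float16 { uint16_t bits; };

// Map a kernel's data type T to the type used for accumulations/statistics:
// everything maps to itself except float16, which is widened to float.
template <typename T>
struct MPTypeTrait {
  using Type = T;
};
template <>
struct MPTypeTrait<float16> {
  using Type = float;
};

template <typename T>
void allocate_batch_norm_buffers() {
  using MPDType = typename MPTypeTrait<T>::Type;
  static_assert(sizeof(MPDType) >= sizeof(T), "stats never lose precision");
  // In the kernel: y stays T, but mean/variance/saved_* and d_scale/d_bias
  // would be allocated as MPDType, as in the MLU diff above.
}

int main() {
  allocate_batch_norm_buffers<float>();    // MPDType == float
  allocate_batch_norm_buffers<float16>();  // MPDType == float (widened)
  static_assert(std::is_same<MPTypeTrait<float16>::Type, float>::value, "");
}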
@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -21,14 +23,6 @@ namespace operators {
class CumprodOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod");
ctx->ShareDim("X", "Out");
ctx->ShareLoD("X", "Out");
}
};
class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel {
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
                  ops::CumprodGradOpMaker<paddle::framework::OpDesc>,
-                 ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
+                 ops::CumprodGradOpMaker<paddle::imperative::OpBase>,
CumprodInferShapeFunctor);
REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
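The cumprod change drops the hand-written InferShape, which only shared X's dims and LoD with Out, and registers phi::UnchangedInferMeta through DECLARE_INFER_SHAPE_FUNCTOR instead. Conceptually an "unchanged" infer-meta just copies the input's metadata to the output; a toy sketch with made-up MetaTensor fields:

#include <cassert>
#include <cstdint>
#include <vector>

// Minimal stand-ins for the metadata carried by a MetaTensor.
struct MetaTensor {
  std::vector<int64_t> dims;
  int dtype = 0;  // encoded dtype id; irrelevant to the shapes here
};

// The essence of an "unchanged" infer meta: output metadata == input metadata.
// cumprod keeps the full input shape because it scans along one axis.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
  out->dtype = x.dtype;
}

int main() {
  MetaTensor x{{4, 5, 6}, /*dtype=*/1};
  MetaTensor out;
  UnchangedInferMeta(x, &out);
  assert(out.dims == x.dims && out.dtype == x.dtype);
}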
@@ -15,9 +15,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
@@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of GatherOp should not be null."));
auto index_dims = ctx->GetInputDim("Index");
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1], 1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(), 1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto axis = ctx->Attrs().Get<int>("axis");
auto input_dim = ctx->GetInputDim("X");
if (ctx->HasInput("Axis") || axis == 0) {
// if HasInput("Axis"), we can not obtain correct shape of output
int batch_size = index_dims[0];
framework::DDim output_dims(input_dim);
output_dims[0] = batch_size;
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -193,11 +141,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X");
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor,
PD_INFER_META(phi::GatherInferMeta));
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                  ops::GatherGradOpMaker<paddle::framework::OpDesc>,
-                 ops::GatherGradOpMaker<paddle::imperative::OpBase>);
+                 ops::GatherGradOpMaker<paddle::imperative::OpBase>,
GatherInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
-                 ops::GatherGradNoNeedBufferVarInferer);
+                 ops::GatherGradNoNeedBufferVarInferer,
GatherGradInferShapeFunctor);
REGISTER_OP_VERSION(gather)
    .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
...
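The InferShape body removed from GatherOp derived the output shape from X, Index, and axis: when Axis is a runtime input (or axis == 0) it keeps X's shape with dim 0 replaced by the number of indices, otherwise it replaces dim `axis`. That rule now lives in phi::GatherInferMeta; restated as a small hypothetical helper:

#include <cassert>
#include <cstdint>
#include <vector>

// Shape rule from the removed GatherOp::InferShape (illustrative helper,
// not phi::GatherInferMeta): when the axis is only known at runtime
// (has_axis_input) or axis == 0, replace dim 0 of x with the number of
// indices; otherwise replace dim `axis`.
std::vector<int64_t> GatherOutShape(const std::vector<int64_t>& x_dims,
                                    int64_t index_size, int axis,
                                    bool has_axis_input) {
  std::vector<int64_t> out = x_dims;
  if (has_axis_input || axis == 0) {
    out[0] = index_size;
  } else {
    out[axis] = index_size;
  }
  return out;
}

int main() {
  // x: [8, 16, 32], gather 5 rows along axis 0 -> [5, 16, 32]
  assert((GatherOutShape({8, 16, 32}, 5, 0, false) ==
          std::vector<int64_t>{5, 16, 32}));
  // x: [8, 16, 32], gather 5 slices along axis 1 -> [8, 5, 32]
  assert((GatherOutShape({8, 16, 32}, 5, 1, false) ==
          std::vector<int64_t>{8, 5, 32}));
}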
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                  ops::GridSampleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
REGISTER_OP_CPU_KERNEL(
grid_sampler,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
grid_sampler_grad,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(grid_sampler)
    .AddCheckpoint(
        R"ROC(
...
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename T>
static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T _unnormalize(T coord, int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T compute_positions(T coord, int size,
PaddingMode padding_mode,
bool align_corners) {
coord = _unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes(coord, 0, 2 * (size - 1));
} else {
coord = reflect_indexes(coord, -1, 2 * size - 1);
}
coord = clip_indexes(coord, size - 1);
}
return coord;
}
template <typename T>
static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
bool align_corners,
T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
bool align_corners, T* grad_in) {
T grad_clip, grad_refl;
coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
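// Forward sampling kernel: one thread handles one (n, h_out, w_out) location
// across all channels. Bilinear mode blends the four neighbouring input
// pixels (nw/ne/sw/se) with weights equal to the opposite corner's area;
// nearest mode rounds to the closest pixel. Taps that fall outside the input
// contribute zero.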
template <typename T>
__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
int out_h, int out_w, int in_h,
int in_w, const T* input, const T* grid,
T* output, const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = compute_positions(ix, in_w, padding_mode, align_corners);
iy = compute_positions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
template <typename T>
class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output = ctx.Output<Tensor>("Output");
auto* output_data = output->mutable_data<T>(ctx.GetPlace());
VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
<< "; " << output->dims()[2] << "; " << output->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sample_cuda_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
grid->data<T>(), output_data, mode, padding_mode, align_corners);
}
};
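// Backward kernel: grad_input is accumulated with CudaAtomicAdd because
// threads from different output locations may scatter into the same input
// pixel; grad_grid (when requested) chains the bilinear-weight derivatives
// through the gix_mult/giy_mult factors produced by
// compute_positions_with_mask. grad_grid may be nullptr when no gradient for
// Grid is needed.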
template <typename T>
__global__ void grid_sampler_cuda_backward_kernel(
const int nthreads, const T* grad_output, const T* input, const T* grid,
int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
T* grad_grid, const Mode mode, const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
&gix_mult);
iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
&giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
nw * gOut);
atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
ne * gOut);
atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
sw * gOut);
atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
se * gOut);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
in_w, grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
template <typename T>
class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.template device_context<paddle::platform::CUDADeviceContext>(),
input_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sampler_cuda_backward_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
padding_mode, align_corners);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
ops::GridSampleOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
ops::GridSampleGradOpCUDAKernel<float>,
ops::GridSampleGradOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <utility>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
enum class Mode {
bilinear,
nearest,
};
enum class PaddingMode { zeros, border, reflect };
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
template <typename T>
static inline bool isInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
}
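// The CPU path below mirrors the CUDA helpers above, but operates on whole
// grid slices at once through Eigen expressions instead of per-element device
// functions.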
template <typename T>
static inline void unnormalize(const platform::CPUDeviceContext& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}
template <typename T>
static inline void clip(const platform::CPUDeviceContext& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners, std::string padding_mode) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}
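// clipWithMask applies the same border/reflection handling as clip, and also
// fills grid_scale with the derivative of the clipped pixel coordinate with
// respect to the normalized grid value: the unnormalization factor, negated
// where a reflection flips direction and zeroed where the coordinate was
// clamped. gatherBilinearGrad later multiplies this into the grid gradient.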
template <typename T>
static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
const int max_val, // height-1 or width-1
bool align_corners, std::string padding_mode,
Tensor* grid_slice, Tensor* grid_scale) {
auto& place = *ctx.eigen_device();
grid_scale->mutable_data<T>(grid_slice->dims(), ctx.GetPlace());
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
auto factor = static_cast<T>(max_val * 0.5);
if (!align_corners) {
factor = static_cast<T>((max_val + 1) * 0.5);
}
auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
if (padding_mode == "border") {
// auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (res == grid_slice_t);
grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
grid_slice_t.device(place) = res;
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto is_neg = (grid_slice_t < static_cast<T>(0));
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>());
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
auto reflected =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
auto clipped = reflected.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (clipped == reflected).template cast<T>();
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>()) *
in_bound;
grid_slice_t.device(place) = clipped;
}
}
}
template <typename T>
static void calcGridLocations(const platform::CPUDeviceContext& ctx,
const Tensor& grid, const int in_h,
const int in_w, bool align_corners,
std::string padding_mode, Tensor* grid_x,
Tensor* grid_y) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}
template <typename T>
static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx,
const Tensor& grid, const int in_h,
const int in_w, bool align_corners,
std::string padding_mode, Tensor* grid_x,
Tensor* grid_y, Tensor* grid_x_scale,
Tensor* grid_y_scale) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
clipWithMask<T>(ctx, in_w - 1, align_corners, padding_mode, grid_x,
grid_x_scale);
clipWithMask<T>(ctx, in_h - 1, align_corners, padding_mode, grid_y,
grid_y_scale);
}
template <typename T>
static void getGridPointValue(const Tensor& input, Tensor* output,
const Tensor& x, const Tensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const int out_h = x.dims()[1];
const int out_w = x.dims()[2];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
auto input_t = EigenTensor<T, 4>::From(input);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}
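// allNeigbors collects, for every sampling point, the coordinates of its four
// neighbouring corners (x_w/x_e along width, y_n/y_s along height), the
// distances to them, and the input values at those corners. Both the bilinear
// forward pass and the bilinear gradient reuse this data.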
template <typename T>
static void allNeigbors(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* x_w, Tensor* x_e, Tensor* y_n,
Tensor* y_s, // positions
Tensor* d_w, Tensor* d_e, Tensor* d_n,
Tensor* d_s, // distance
Tensor* v_wn, Tensor* v_en, Tensor* v_ws,
Tensor* v_es) { // values
auto& place = *ctx.eigen_device();
const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
// calculate coords of 4 corner points
x_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
x_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
y_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
y_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);
// calculate distances to 4 sides
d_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
d_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
// calc 4 corner points value
v_wn->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_en->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_ws->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
v_es->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
getGridPointValue<T>(input, v_wn, *x_w, *y_n);
getGridPointValue<T>(input, v_en, *x_e, *y_n);
getGridPointValue<T>(input, v_ws, *x_w, *y_s);
getGridPointValue<T>(input, v_es, *x_e, *y_s);
}
template <typename T>
static void bilinearInter(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* out) {
auto& place = *ctx.eigen_device();
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
Tensor v_wn, v_en, v_ws, v_es;
allNeigbors<T>(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e,
&d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*out);
  // bilinear interpolation by 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
template <typename T>
static void nearestInter(const platform::CPUDeviceContext& ctx,
const Tensor& input, Tensor* grid_x, Tensor* grid_y,
Tensor* out) {
auto& place = *ctx.eigen_device();
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
getGridPointValue<T>(input, out, *grid_x, *grid_y);
}
template <typename T>
static void gatherOutputGradToInputGrad(const Tensor& output_grad,
Tensor* input_grad, const Tensor& x,
const Tensor& y, const Tensor& d1,
const Tensor& d2) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto d1_t = EigenTensor<T, 3>::From(d1);
auto d2_t = EigenTensor<T, 3>::From(d2);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
}
}
}
}
}
}
template <typename T>
static void gatherOutputGradToInputGrad(const Tensor& output_grad,
Tensor* input_grad, const Tensor& x,
const Tensor& y) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
(T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l);
}
}
}
}
}
}
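// gatherBilinearGrad back-propagates through bilinear sampling on the CPU: it
// scatters output_grad into input_grad through the four corner taps and, when
// grid_grad is requested, accumulates the analytic derivative of the bilinear
// weights with respect to x and y, scaled by the masks from clipWithMask,
// before interleaving the x/y results into the (n, h, w, 2) grid gradient.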
template <typename T>
static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx,
const Tensor& input, const Tensor& output_grad,
Tensor* grid_x, Tensor* grid_y,
Tensor* grid_x_scale, Tensor* grid_y_scale,
Tensor* input_grad, Tensor* grid_grad) {
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
Tensor v_wn, v_en, v_ws, v_es;
allNeigbors<T>(ctx, input,
grid_x, // grid_x
grid_y, // grid_y
&x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en,
&v_ws, &v_es);
// gather output grad value to input grad by corner point coords and weight
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
if (grid_grad != nullptr) {
Tensor grid_grad_x, grid_grad_y;
grid_grad_x.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
grid_grad_y.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
auto grid_grad_x_t =
EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
auto grid_grad_y_t =
EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
grid_grad_x_t(i, k, l) +=
((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
(v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
output_grad_t(i, j, k, l);
grid_grad_y_t(i, k, l) +=
((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
(v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
output_grad_t(i, j, k, l);
}
}
}
}
// const T x_max = static_cast<T>(in_w - 1);
// const T y_max = static_cast<T>(in_h - 1);
auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
// gather grid_grad [x, y] in 3rd Dim
T* grid_grad_data = grid_grad->data<T>();
T* grid_grad_x_data = grid_grad_x.data<T>();
T* grid_grad_y_data = grid_grad_y.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_grad_data[2 * i] = grid_grad_x_data[i];
grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
}
}
}
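// CPU forward kernel: computes pixel-space sampling locations with
// calcGridLocations, then either bilinearly interpolates or, in nearest mode,
// rounds the locations and gathers the values directly.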
template <typename DeviceContext, typename T>
class GridSampleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode = ctx.Attr<std::string>("padding_mode");
auto mode = ctx.Attr<std::string>("mode");
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
Tensor grid_x, grid_y;
calcGridLocations<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
in_w, align_corners, padding_mode, &grid_x, &grid_y);
if (mode == "bilinear") {
bilinearInter<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *input,
&grid_x, &grid_y, output);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
getGridPointValue<T>(*input, output, grid_x, grid_y);
}
}
};
template <typename DeviceContext, typename T>
class GridSampleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode = ctx.Attr<std::string>("padding_mode");
auto mode = ctx.Attr<std::string>("mode");
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), input_grad,
static_cast<T>(0));
Tensor* grid_grad = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad->mutable_data<T>({n, out_h, out_w, 2}, ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), grid_grad,
static_cast<T>(0));
}
Tensor grid_x, grid_y;
Tensor grid_x_scale, grid_y_scale;
calcGridLocationsWithGrad<T>(
ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale,
&grid_y_scale);
if (mode == "bilinear") {
gatherBilinearGrad<T>(ctx.template device_context<DeviceContext>(),
*input, *output_grad, &grid_x, &grid_y,
&grid_x_scale, &grid_y_scale, input_grad,
grid_grad);
} else {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
gatherOutputGradToInputGrad<T>(*output_grad, input_grad, grid_x, grid_y);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -13,8 +13,13 @@
// limitations under the License.
#include "paddle/fluid/operators/index_select_op.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of IndexSelectOp should not be null."));
auto input_dim = ctx->GetInputDim("X");
auto index_dim = ctx->GetInputDim("Index");
auto dim = ctx->Attrs().Get<int>("dim");
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(), input_dim.size() - 1, dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true, platform::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim, index_dim.size()));
PADDLE_ENFORCE_EQ(index_dim[0] != 0, true,
platform::errors::InvalidArgument(
"The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
ctx->SetOutputDim("Out", phi::make_ddim(output_dim));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer,
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor,
PD_INFER_META(phi::IndexSelectInferMeta));
REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker,
ops::IndexSelectGradMaker<paddle::framework::OpDesc>,
ops::IndexSelectGradMaker<paddle::imperative::OpBase>); ops::IndexSelectGradMaker<paddle::imperative::OpBase>,
IndexSelectInferShapeFunctor);
REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp,
ops::IndexSelectGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
index_select,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_select_grad,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_select_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
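// Forward kernel of index_select. For a flattened output element `idx`,
// `stride` is the product of the dimensions after `dim`, `size` is the output
// extent of `dim`, and `delta` = input_dim[dim] - size. Decomposing idx into
// (pre_idx, dim_idx, inner offset) and replacing dim_idx with the selected
// row index[dim_idx] gives the source element:
//   input_idx = idx + (delta * pre_idx + index[dim_idx] - dim_idx) * stride.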
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input, T* output,
const IndexT* index, int64_t N,
int64_t stride, int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
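// Backward kernel: walks output_grad elements and scatters them into
// input_grad with CudaAtomicAdd, since several entries of Index may select
// the same input row. input_grad is zeroed beforehand by
// index_select_grad_init.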
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index, int64_t nums,
int64_t N, int64_t stride,
int64_t size, int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename DeviceContext, typename T>
class IndexSelectCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* index = context.Input<LoDTensor>("Index");
auto* out = context.Output<LoDTensor>("Out");
int dim = context.Attr<int>("dim");
auto input_dim = in->dims();
auto output_dim = out->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = out->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
numel, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
template <typename DeviceContext, typename T>
class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* in_grad = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<LoDTensor>("Index");
auto* output_grad_data = output_grad->data<T>();
auto* in_grad_data = in_grad->mutable_data<T>(context.GetPlace());
int dim = context.Attr<int>("dim");
auto input_dim = in_grad->dims();
auto output_dim = output_grad->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
int64_t numel = in_grad->numel();
int64_t index_nums = index->numel();
int64_t out_nums = output_grad->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_select,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_select_grad,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context,
output->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto inputs = *context.Input<framework::LoDTensor>("X");
auto* index = context.Input<framework::LoDTensor>("Index");
auto* output = context.Output<framework::LoDTensor>("Out");
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += inputs.dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectInner<DeviceContext, T, int>(context, &inputs, *index, output,
dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectInner<DeviceContext, T, int64_t>(context, &inputs, *index,
output, dim);
}
}
};
template <typename DeviceContext, typename T, class Enable = void>
struct IndexSelectAdd {
void operator()(const framework::ExecutionContext& ctx, int slice_size,
...@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context,
x_grad->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x_grad =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<framework::LoDTensor>("Index");
auto* out_grad =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += out_grad->dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectGradInner<DeviceContext, T, int>(context, *out_grad, *index,
x_grad, dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectGradInner<DeviceContext, T, int64_t>(context, *out_grad,
*index, x_grad, dim);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_select_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class IndexSelectNPUKernel : public framework::OpKernel<T> {
public:
...
...@@ -14,10 +14,13 @@
#include <cmath>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
platform::errors::PreconditionNotMet(
"Input(Input) and Input(Other) must have the same "
"dimension size."));
int n = input_dim.size();
bool is_runtime = ctx->IsRuntime();
for (int i = 0; i < n; i++) {
if (is_runtime) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
} else {
if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
}
}
}
ctx->SetOutputDim("Out", input_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
...@@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor,
PD_INFER_META(phi::ValueCompareInferMeta));
REGISTER_OPERATOR(
isclose, ops::IscloseOp, ops::IscloseOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::IscloseOpVarTypeInference); ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor);
...@@ -11,7 +11,9 @@
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
...@@ -21,44 +23,6 @@ using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
platform::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(), dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i], dim_target[i],
platform::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i, dim_x[i], i, dim_target[i]));
}
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid, true,
platform::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
...@@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor,
PD_INFER_META(phi::KLDivInferMeta));
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>); ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
KLDivInferShapeFunctor);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
ops::KLDivLossGradNoNeedBufferVarInferer);
...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"
...@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU,
const auto W = udims[udims.size() - 1];
auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W,
L_dataptr);
x_for_range(tril_computer);
TrilTriuCompute<T> triu_computer(LU->data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
U->mutable_data<T>(dev_ctx.GetPlace())); LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
x_for_range(triu_computer);
// set L's diagonal 1
...@@ -532,14 +533,14 @@ class LUGradKernel : public framework::OpKernel<T> {
auto phil_rank = LmHdims.size();
auto phiu_rank = UmHdims.size();
platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel());
TrilTriuCompute<T> tril_computer(phi_L.data<T>(), -1, true, phi::funcs::TrilTriuCompute<T> tril_computer(
LmHdims[phil_rank - 2], phi_L.data<T>(), -1, true, LmHdims[phil_rank - 2],
LmHdims[phil_rank - 1], phi_L.data<T>());
l_for_range(tril_computer);
platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel());
TrilTriuCompute<T> triu_computer(phi_U.data<T>(), 0, false, phi::funcs::TrilTriuCompute<T> triu_computer(
UmHdims[phiu_rank - 2], phi_U.data<T>(), 0, false, UmHdims[phiu_rank - 2],
UmHdims[phiu_rank - 1], phi_U.data<T>());
u_for_range(triu_computer);
...@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> tril_computer(phi_complement.data<T>(), -1, true, H, phi::funcs::TrilTriuCompute<T> tril_computer(
W, phi_complement_l.data<T>()); phi_complement.data<T>(), -1, true, H, W,
phi_complement_l.data<T>());
x_for_range(tril_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi);
...@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> triu_computer(phi_complement.data<T>(), 0, false, H, W, phi::funcs::TrilTriuCompute<T> triu_computer(
phi_complement_u.data<T>()); phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
x_for_range(triu_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi);
...
...@@ -16,7 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lu_op.h" #include "paddle/fluid/operators/lu_op.h"
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
auto W = ldims[ldims.size() - 1]; auto W = ldims[ldims.size() - 1];
auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace()); auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel()); platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel());
TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W, L_dataptr); phi::funcs::TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W,
L_dataptr);
l_for_range(tril_computer); l_for_range(tril_computer);
const auto udims = du->dims(); const auto udims = du->dims();
...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> { ...@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
W = udims[udims.size() - 1]; W = udims[udims.size() - 1];
auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace()); auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel()); platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel());
TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W, U_dataptr); phi::funcs::TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W,
U_dataptr);
u_for_range(triu_computer); u_for_range(triu_computer);
auto xdims = dx->dims(); auto xdims = dx->dims();
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/multiplex_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ...@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>, ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>); ops::MultiplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL(
multiplex,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
multiplex_grad,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/multiplex_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class MultiplexGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<Tensor>("X");
auto* ids = ctx.Input<Tensor>("Ids");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
};
template <typename Place, typename T>
class MultiplexGradGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<Tensor>("Ids");
auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<Place>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
multiplex,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
multiplex_grad,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MultiplexCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto ids = ctx.Input<framework::Tensor>("Ids");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T));
}
}
};
template <typename DeviceContext, typename T>
class MultiplexGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<framework::Tensor>("Ids");
auto d_ins =
ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
auto* index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
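For reference, the multiplex kernels above gather whole rows: for each row i, the output row is copied from candidate tensor ins[index[i]]. A small self-contained sketch of that row-selection logic with plain arrays (hypothetical helper, not the Paddle API):
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Row-wise multiplex: out[i, :] = ins[index[i]][i, :].
void MultiplexRows(const std::vector<const float*>& ins, const int32_t* index,
                   int rows, int cols, float* out) {
  for (int i = 0; i < rows; ++i) {
    const int32_t k = index[i];  // which candidate supplies row i
    std::memcpy(out + i * cols, ins[k] + i * cols, cols * sizeof(float));
  }
}

int main() {
  const float a[4] = {1, 1, 1, 1}, b[4] = {2, 2, 2, 2};  // two 2x2 candidates
  const int32_t ids[2] = {1, 0};
  float out[4];
  MultiplexRows({a, b}, ids, 2, 2, out);
  assert(out[0] == 2 && out[2] == 1);  // row 0 taken from b, row 1 from a
  return 0;
}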
...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, ...@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
REGISTER_OPERATOR(qr_grad, ops::QrGradOp); REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>, qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>); ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) { ...@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
return std::make_tuple(compute_q, reduced); return std::make_tuple(compute_q, reduced);
} }
template <typename T>
class QrCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = q.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
memset(q_data, 0,
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
}
auto* r_data = r.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
};
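The deleted QrCPUKernel wraps Eigen::HouseholderQR; the core of its reduced-mode computation can be reproduced in a few lines of plain Eigen, sketched below for illustration (assumes Eigen3 is available; this is not the kernel itself, which additionally transposes Q and R before the row-major memcpy):
#include <Eigen/Dense>
#include <iostream>

int main() {
  using Mat = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  Mat x = Mat::Random(5, 3);  // m = 5, n = 3, so min_mn = 3
  Eigen::HouseholderQR<Mat> qr(x);
  // Reduced mode: thin Q is m x min_mn, R is the upper-triangular top block (min_mn x n).
  Mat q = qr.householderQ() * Mat::Identity(5, 3);
  Mat r = qr.matrixQR().block(0, 0, 3, 3).triangularView<Eigen::Upper>();
  std::cout << (q * r - x).norm() << std::endl;  // reconstruction error, ~0
  return 0;
}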
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class QrGradKernel : public framework::OpKernel<T> { class QrGradKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { ...@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of ROIAlignOp "
"is not found."));
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisNum")) {
auto rois_num_dims = ctx->GetInputDim("RoisNum");
PADDLE_ENFORCE_EQ(
rois_num_dims.size(), 1,
platform::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
rois_num_dims.size()));
}
PADDLE_ENFORCE_EQ(
input_dims.size(), 4,
platform::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
"The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
rois_dims.size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(rois_dims[1], 4,
platform::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
rois_dims[1]));
}
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(pooled_height, 0,
platform::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width, 0,
platform::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
platform::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Out", out_dims);
}
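The shape logic removed here reduces to one rule: Out is {num_rois, channels, pooled_height, pooled_width}, e.g. X of shape {2, 256, 64, 64} with 100 RoIs and a 7x7 pool gives {100, 256, 7, 7}. A one-line hypothetical helper for that rule:
#include <array>
#include <cstdint>

// ROIAlign output shape rule from the removed InferShape (X is NCHW).
std::array<int64_t, 4> RoiAlignOutDims(const std::array<int64_t, 4>& x_dims,
                                       int64_t num_rois, int pooled_h, int pooled_w) {
  return {num_rois, x_dims[1], pooled_h, pooled_w};
}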
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); ...@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
PD_INFER_META(phi::RoiAlignInferMeta));
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
ops::ROIAlignGradMaker<paddle::framework::OpDesc>, ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
ops::ROIAlignGradMaker<paddle::imperative::OpBase>); ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
RoiAlignInferShapeFunctor);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
ops::RoiAlignGradNoNeedBufVarsInferer); ops::RoiAlignGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_VERSION(roi_align) REGISTER_OP_VERSION(roi_align)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <numeric>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <class T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
const T out_grad_this_bin, const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
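As a quick cross-check of the weights above: for an interior sample point (y, x), the four surrounding pixels receive fractions w1 = hy*hx, w2 = hy*lx, w3 = ly*hx, w4 = ly*lx of out_grad_this_bin / count, and the four weights always sum to 1. A standalone sketch (hypothetical helper, interior case only, ignoring the boundary clamping done in the kernel):
#include <cassert>
#include <cmath>

// Bilinear weights of a point (y, x) w.r.t. its surrounding integer grid cell.
void BilinearWeights(double y, double x, double* w /* w[0..3] */) {
  const double ly = y - std::floor(y), lx = x - std::floor(x);
  const double hy = 1.0 - ly, hx = 1.0 - lx;
  w[0] = hy * hx;  // (y_low, x_low)
  w[1] = hy * lx;  // (y_low, x_high)
  w[2] = ly * hx;  // (y_high, x_low)
  w[3] = ly * lx;  // (y_high, x_high)
}

int main() {
  double w[4];
  BilinearWeights(2.25, 3.5, w);
  assert(std::fabs(w[0] + w[1] + w[2] + w[3] - 1.0) < 1e-12);
  return 0;
}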
template <typename DeviceContext, typename T>
class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
auto aligned = ctx.Attr<bool>("aligned");
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
in_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, in_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
auto in_stride = phi::stride(in->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out_grad->dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height, width, y, x,
out_grad_this_bin, count,
batch_grad_data);
}
}
}
}
}
rois_data += roi_stride[0];
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,13 +12,16 @@ ...@@ -12,13 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { ...@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of RollOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of RollOp should not be null."));
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
if (!ctx->HasInput("ShiftsTensor")) {
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); ...@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
PD_INFER_META(phi::RollInferMeta));
REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
ops::RollGradMaker<paddle::framework::OpDesc>, ops::RollGradMaker<paddle::framework::OpDesc>,
ops::RollGradMaker<paddle::imperative::OpBase>); ops::RollGradMaker<paddle::imperative::OpBase>,
RollInferShapeFunctor);
REGISTER_OPERATOR(roll_grad, ops::RollGradOp, REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
ops::RollGradNoNeedBufferVarsInferer); ops::RollGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(roll) REGISTER_OP_VERSION(roll)
.AddCheckpoint( .AddCheckpoint(
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
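The kernel above remaps each flat input index to its rolled position one axis at a time. The same remapping for a single axis of a contiguous tensor can be sketched on the host as follows (hypothetical helper mirroring the index arithmetic in RollCudaKernel, not the Paddle API):
#include <cstdint>
#include <vector>

// Roll a contiguous tensor by `shift` along one axis, given that axis's stride and size.
std::vector<float> RollOneAxis(const std::vector<float>& in, int64_t stride,
                               int64_t size, int64_t shift) {
  std::vector<float> out(in.size());
  shift = (shift % size + size) % size;  // normalize, as the op does before launch
  for (int64_t idx = 0; idx < static_cast<int64_t>(in.size()); ++idx) {
    const int64_t dim_idx = (idx / stride) % size;
    const int64_t new_dim_idx = dim_idx + shift;
    const int64_t out_idx =
        idx + (new_dim_idx >= size ? shift - size : shift) * stride;
    out[out_idx] = in[idx];  // element moves forward by `shift` along the axis
  }
  return out;
}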
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ...@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>, ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
const int64_t H, const int64_t W, T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
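TrilTriuCompute keeps an element when it lies on the wanted side of the chosen diagonal: element (row, col) is zeroed when lower && col - row > diagonal, or when !lower && col - row < diagonal. A minimal host-side sketch of the same mask over a row-major H x W matrix (illustrative only):
#include <cstdint>

// Apply the tril/triu mask, mirroring TrilTriuCompute's operator().
void TrilTriu(const float* in, int diagonal, bool lower, int64_t H, int64_t W,
              float* out) {
  for (int64_t idx = 0; idx < H * W; ++idx) {
    const int64_t row = (idx / W) % H;
    const int64_t col = idx % W;
    const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal);
    out[idx] = mask ? 0.0f : in[idx];
  }
}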
template <typename DeviceContext, typename T>
class TrilTriuOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* x_data = x->data<T>();
auto* out = context.Output<framework::Tensor>("Out");
auto* out_data = out->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = x->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(x->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
};
template <typename DeviceContext, typename T>
class TrilTriuGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* d_out =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
const auto* dout_data = d_out->data<T>();
auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = d_out->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(d_out->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
};
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle { namespace paddle {
......
...@@ -143,6 +143,7 @@ void BindNode(py::module *m) { ...@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
.def("var", &Node::Var, return_value_policy::reference) .def("var", &Node::Var, return_value_policy::reference)
.def("op", &Node::Op, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference)
.def("id", &Node::id) .def("id", &Node::id)
.def("graph_id", &Node::GraphId)
.def("original_desc_id", &Node::OriginalDescId) .def("original_desc_id", &Node::OriginalDescId)
.def("is_op", &Node::IsOp) .def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar) .def("is_var", &Node::IsVar)
......
...@@ -114,6 +114,7 @@ limitations under the License. */ ...@@ -114,6 +114,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h" #include "paddle/fluid/pybind/nccl_wrapper_py.h"
...@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) {
// stored in this static instance to avoid illegal memory access. // stored in this static instance to avoid illegal memory access.
m.def("clear_kernel_factory", m.def("clear_kernel_factory",
[]() { phi::KernelFactory::Instance().kernels().clear(); }); []() { phi::KernelFactory::Instance().kernels().clear(); });
m.def("clear_device_manager", []() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::Clear();
#endif
});
// NOTE(zjl): ctest would load environment variables at the beginning even // NOTE(zjl): ctest would load environment variables at the beginning even
// though we have not `import paddle.fluid as fluid`. So we add this API // though we have not `import paddle.fluid as fluid`. So we add this API
......
...@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { ...@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
return DenseTensorType::get( return DenseTensorType::get(
parser.getContext(), *targetType, *precisionType, *layoutType); parser.getContext(), *targetType, *precisionType, *layoutType);
} }
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
// Todo: parse other type // Todo: parse other type
return mlir::Type(); return mlir::Type();
} }
...@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type,
} }
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW> // print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<infrt::DenseTensorType>()) { if (type.isa<DenseTensorType>()) {
auto dense_tensor_type = type.cast<infrt::DenseTensorType>(); auto dense_tensor_type = type.cast<infrt::DenseTensorType>();
os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " os << "dense_tensor<" << dense_tensor_type.getTarget() << ", "
<< dense_tensor_type.getPrecision() << ", " << dense_tensor_type.getPrecision() << ", "
...@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type,
return; return;
} }
  // print DenseTensorMapType, for example: !infrt.dense_tensor_map
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
llvm_unreachable("unknown infrt type."); llvm_unreachable("unknown infrt type.");
} }
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/infrt/common/global.h" #include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/mlir_loader.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file( static llvm::cl::opt<std::string> input_file(
......
...@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { ...@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() {
return platform_manager; return platform_manager;
} }
void DeviceManager::Clear() {
Instance().device_map_.clear();
Instance().device_impl_map_.clear();
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) { std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries; std::vector<std::string> libraries;
std::regex express(".*\\.so"); std::regex express(".*\\.so");
......
...@@ -158,6 +158,8 @@ class DeviceManager { ...@@ -158,6 +158,8 @@ class DeviceManager {
static std::vector<size_t> GetDeviceList(const std::string& device_type); static std::vector<size_t> GetDeviceList(const std::string& device_type);
static void Clear();
private: private:
DISABLE_COPY_AND_ASSIGN(DeviceManager); DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {} DeviceManager() {}
......
...@@ -24,6 +24,10 @@ limitations under the License. */ ...@@ -24,6 +24,10 @@ limitations under the License. */
namespace phi { namespace phi {
// Common InferMeta Functions for backward operators.
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void BilinearTensorProductGradInferMeta(const MetaTensor& x, void BilinearTensorProductGradInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
const MetaTensor& weight, const MetaTensor& weight,
......
...@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x, ...@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x,
out->set_dtype(DataType::BOOL); out->set_dtype(DataType::BOOL);
} }
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config) {
auto dim_x = x.dims();
auto dim_target = label.dims();
PADDLE_ENFORCE_EQ(dim_x.size(),
dim_target.size(),
phi::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(),
dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i],
dim_target[i],
phi::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i,
dim_x[i],
i,
dim_target[i]));
}
}
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid,
true,
phi::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
out->set_dims(dim_x);
} else {
out->set_dims({1});
}
out->set_dtype(x.dtype());
}
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
out->share_meta(x); out->share_meta(x);
} }
...@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x, ...@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
out->share_lod(x); out->share_lod(x);
} }
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out) {
auto index_dims = index.dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
phi::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
phi::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto input_dim = x.dims();
auto axis_v = axis.to<int>();
if (axis.FromTensor() || axis_v == 0) {
    // if axis.FromTensor(), we cannot obtain the correct output shape here
int batch_size = index_dims[0];
phi::DDim output_dims(input_dim);
output_dims[0] = batch_size;
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis_v; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis_v + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
}
}
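As a worked example of GatherInferMeta: with x of shape [5, 7, 9], a 1-D index of length 3 and axis = 1, the output shape is [5, 3, 9]; with axis = 0 (or an axis coming from a tensor) the leading dimension is simply replaced by index_dims[0]. A small sketch of the axis != 0 branch with std types only (hypothetical helper):
#include <cassert>
#include <cstdint>
#include <vector>

// Output dims of gather along `axis` for a 1-D index (the axis != 0 branch above).
std::vector<int64_t> GatherOutDims(const std::vector<int64_t>& x_dims,
                                   int64_t index_size, int axis) {
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);
  for (int i = axis + 1; i < static_cast<int>(x_dims.size()); ++i)
    out.push_back(x_dims[i]);
  return out;
}

int main() {
  assert((GatherOutDims({5, 7, 9}, 3, 1) == std::vector<int64_t>{5, 3, 9}));
  return 0;
}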
void GatherNdInferMeta(const MetaTensor& x, void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
MetaTensor* out) { MetaTensor* out) {
...@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x, ...@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x,
out->share_lod(y); out->share_lod(y);
} }
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output) {
auto input_dim = x.dims();
auto index_dim = index.dims();
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(),
input_dim.size() - 1,
dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true,
phi::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim,
index_dim.size()));
PADDLE_ENFORCE_EQ(
index_dim[0] != 0,
true,
phi::errors::InvalidArgument("The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
output->set_dims(phi::make_ddim(output_dim));
output->set_dtype(x.dtype());
output->set_layout(x.layout());
output->share_lod(x);
}
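// Shape sketch: x of shape [5, 6] with a length-3 index and dim == 1 (or the
// equivalent dim == -1 after wrapping) produces output dims [5, 3]; dtype,
// layout and lod are all inherited from x.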
void LogLossInferMeta(const MetaTensor& input, void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label, const MetaTensor& label,
float epsilon, float epsilon,
...@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x, ...@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x,
out->share_lod(y); out->share_lod(y);
} }
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
detail::BinarySameInputDimsCheck(x, y, config);
out->set_dims(x.dims());
out->set_dtype(DataType::BOOL);
}
} // namespace phi } // namespace phi
PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/meta_tensor.h"
namespace phi { namespace phi {
...@@ -28,12 +29,20 @@ namespace phi { ...@@ -28,12 +29,20 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AllValueCompareInferMeta(const MetaTensor& x, void AllValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config = MetaConfig());
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
void BCELossInferMeta(const MetaTensor& input, void BCELossInferMeta(const MetaTensor& input,
...@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, ...@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
int axis, int axis,
MetaTensor* out); MetaTensor* out);
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out);
void GatherNdInferMeta(const MetaTensor& x, void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
MetaTensor* out); MetaTensor* out);
...@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x, ...@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output);
void LogLossInferMeta(const MetaTensor& input, void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label, const MetaTensor& label,
float epsilon, float epsilon,
...@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x, ...@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x,
bool unitriangular, bool unitriangular,
MetaTensor* out); MetaTensor* out);
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
} // namespace phi } // namespace phi
...@@ -18,6 +18,23 @@ limitations under the License. */ ...@@ -18,6 +18,23 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/meta_tensor.h"
namespace phi { namespace phi {
// Common InferMeta Functions for multiary operators. The formats are like:
//
// 1. The number of input MetaTensor is more than 3:
// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
// const MetaTensor& y,
// const MetaTensor& z,
// const MetaTensor& w,
// ...,
// MetaTensor* out) {}
//
// 2. There are `const vector<MetaTensor*>&` in params:
// void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
// ...,
// MetaTensor* out) {}
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors); std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
void AdadeltaInferMeta(const MetaTensor& param, void AdadeltaInferMeta(const MetaTensor& param,
......
...@@ -27,6 +27,8 @@ namespace phi { ...@@ -27,6 +27,8 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
......
...@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input, ...@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input,
total_weight->set_dtype(input.dtype()); total_weight->set_dtype(input.dtype());
} }
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
auto boxes_dims = boxes.dims();
if (boxes_num) {
auto boxes_num_dims = boxes_num->dims();
PADDLE_ENFORCE_EQ(
boxes_num_dims.size(),
1,
phi::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
boxes_num_dims.size()));
}
PADDLE_ENFORCE_EQ(input_dims.size(),
4,
phi::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(boxes_dims.size(),
2,
phi::errors::InvalidArgument("The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
boxes_dims.size()));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(boxes_dims[1],
4,
phi::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
boxes_dims[1]));
}
PADDLE_ENFORCE_GT(pooled_height,
0,
phi::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width,
0,
phi::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale,
0.0f,
phi::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = boxes_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
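// Shape sketch: x of shape [N, C, H, W] with boxes of shape [num_rois, 4]
// produces out [num_rois, C, pooled_height, pooled_width], keeping x's dtype.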
void ScatterInferMeta(const MetaTensor& x, void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& updates, const MetaTensor& updates,
......
...@@ -30,6 +30,8 @@ namespace phi { ...@@ -30,6 +30,8 @@ namespace phi {
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
// //
// The InferMeta Functions in this file are arranged in alphabetic order.
void AccuracyInferMeta(const MetaTensor& out, void AccuracyInferMeta(const MetaTensor& out,
const MetaTensor& indice, const MetaTensor& indice,
const MetaTensor& label, const MetaTensor& label,
...@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input, ...@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input,
MetaTensor* total_weight, MetaTensor* total_weight,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ScatterInferMeta(const MetaTensor& x, void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& updates, const MetaTensor& updates,
......
...@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ...@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
ReshapeInferMeta(x, shape, out, config); ReshapeInferMeta(x, shape, out, config);
} }
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out) {
auto shifts_data = shifts.GetData();
if (axis.size() != 0) {
PADDLE_ENFORCE_EQ(
axis.size(),
shifts_data.size(),
phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
axis.size(),
shifts_data.size()));
} else {
PADDLE_ENFORCE_EQ(
shifts_data.size(),
1,
phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts_data.size()));
}
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
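// Note that roll never changes the shape: out simply inherits x's dims, lod
// and dtype; the checks above only validate that shifts and axis sizes match
// (or that a single shift is given when axis is empty).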
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims(); auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()})); out->set_dims(phi::make_ddim({in_dim.size()}));
......
...@@ -31,6 +31,8 @@ class MetaConfig; ...@@ -31,6 +31,8 @@ class MetaConfig;
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need // Because functions in this file not only can infer shape, but also need
// infer lod or other useful data. // infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void ArgMinMaxInferMeta(const MetaTensor& x, void ArgMinMaxInferMeta(const MetaTensor& x,
int64_t axis, int64_t axis,
...@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ...@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in, void ShardIndexInferMeta(const MetaTensor& in,
......
...@@ -26,6 +26,23 @@ namespace phi { ...@@ -26,6 +26,23 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx); DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx);
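// For reference, DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
// expands to:
//   template <typename T, typename Context>
//   void LeakyReluGradKernel(const Context& dev_ctx,
//                            const DenseTensor& x,
//                            const DenseTensor& dout,
//                            float alpha,
//                            DenseTensor* dx);
// which replaces the hand-written LeakyReluGradKernel declaration below.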
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ #define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -33,6 +50,14 @@ namespace phi { ...@@ -33,6 +50,14 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx); DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void ReluDoubleGradKernel(const Context& dev_ctx, void ReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out, const DenseTensor& out,
...@@ -59,34 +84,29 @@ void TanhTripleGradKernel(const Context& dev_ctx, ...@@ -59,34 +84,29 @@ void TanhTripleGradKernel(const Context& dev_ctx,
DenseTensor* d_ddx); DenseTensor* d_ddx);
template <typename T, typename Context> template <typename T, typename Context>
void BReluGradKernel(const Context& dev_ctx, void LeakyReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& dout, const DenseTensor& ddx,
float t_min, float alpha,
float t_max, DenseTensor* ddout);
DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx, void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout, const DenseTensor& dout,
float alpha, float alpha,
DenseTensor* dx); DenseTensor* dx);
template <typename T, typename Context> template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx, void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx, const DenseTensor& ddx,
float alpha, float alpha,
DenseTensor* dx,
DenseTensor* ddout); DenseTensor* ddout);
template <typename T, typename Context>
void ThresholdedReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float threshold,
DenseTensor* dx);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
...@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); ...@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
} // namespace phi } // namespace phi
...@@ -24,6 +24,21 @@ namespace phi { ...@@ -24,6 +24,21 @@ namespace phi {
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Cos)
DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Tan)
DECLARE_ACTIVATION_KERNEL(Acos) DECLARE_ACTIVATION_KERNEL(Acos)
...@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh) ...@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(Tanh)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
template <typename T, typename Context> DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
void BReluKernel(const Context& dev_ctx, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
const DenseTensor& x, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
float t_min, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
float t_max, DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
DenseTensor* out);
template <typename T, typename Context> DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
void LeakyReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float alpha,
DenseTensor* out);
template <typename T, typename Context>
void ThresholdedReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
DenseTensor* out);
} // namespace phi } // namespace phi
...@@ -21,18 +21,18 @@ limitations under the License. */ ...@@ -21,18 +21,18 @@ limitations under the License. */
namespace phi { namespace phi {
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \ const DenseTensor& x, \
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -40,14 +40,14 @@ namespace phi { ...@@ -40,14 +40,14 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
float attr, \ float attr, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \ name, functor_class, attr1, attr2) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -56,26 +56,26 @@ namespace phi { ...@@ -56,26 +56,26 @@ namespace phi {
float attr1, \ float attr1, \
float attr2, \ float attr2, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \ *(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \ *(attrs[1].second) = attr2; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \ const DenseTensor& out, \
const DenseTensor& dout, \ const DenseTensor& dout, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ #define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -83,39 +83,78 @@ namespace phi { ...@@ -83,39 +83,78 @@ namespace phi {
const DenseTensor& dout, \ const DenseTensor& dout, \
float attr, \ float attr, \
DenseTensor* dx) { \ DenseTensor* dx) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \ ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
funcs::LeakyReluGradFunctor,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
LeakyReluGradFunctor,
alpha); alpha);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
funcs::BReluGradFunctor, SoftShrinkGradFunctor,
lambda);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
HardShrinkGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
BReluGradFunctor,
t_min, t_min,
t_max); t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
auto x_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad"));
auto out_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad"));
auto dout_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad"));
auto dx_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad"));
auto* place = dev_ctx.eigen_device();
if (alpha > 0) {
funcs::ELUGradFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
} else {
funcs::ELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
}
}
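// The branch on alpha above only switches which functor supplies the gradient
// formula (funcs::ELUGradFunctor for alpha > 0, funcs::ELUGradNegativeAlphaFunctor
// otherwise); both are applied to the same flattened Eigen views of x, out,
// dout and dx.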
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
...@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) ...@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel) ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
ReluDoubleGradKernel) ReluDoubleGradKernel)
...@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, ...@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
TanhDoubleGradKernel) TanhDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel) LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
PD_REGISTER_KERNEL(tanh_triple_grad, PD_REGISTER_KERNEL(tanh_triple_grad,
CPU, CPU,
......
...@@ -23,8 +23,9 @@ namespace phi { ...@@ -23,8 +23,9 @@ namespace phi {
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class functor; \ funcs::functor_class<T> functor; \
ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ #define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
...@@ -33,10 +34,11 @@ namespace phi { ...@@ -33,10 +34,11 @@ namespace phi {
const DenseTensor& x, \ const DenseTensor& x, \
float attr, \ float attr, \
DenseTensor* out) { \ DenseTensor* out) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \ *(attrs[0].second) = attr; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ #define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \
...@@ -47,50 +49,63 @@ namespace phi { ...@@ -47,50 +49,63 @@ namespace phi {
float attr1, \ float attr1, \
float attr2, \ float attr2, \
DenseTensor* out) { \ DenseTensor* out) { \
functor_class<T> functor; \ funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \ auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \ *(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \ *(attrs[1].second) = attr2; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \ ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
funcs::ThresholdedReluFunctor, ThresholdedReluFunctor,
threshold) threshold)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T>
static inline void ClipWithMask(const CPUContext& ctx,
const int max_val, // height-1 or width-1
bool align_corners,
std::string padding_mode,
DenseTensor* grid_slice,
DenseTensor* grid_scale) {
auto& place = *ctx.eigen_device();
grid_scale->Resize(grid_slice->dims());
ctx.Alloc<T>(grid_scale);
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
auto factor = static_cast<T>(max_val * 0.5);
if (!align_corners) {
factor = static_cast<T>((max_val + 1) * 0.5);
}
auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
if (padding_mode == "border") {
// auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (res == grid_slice_t);
grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
grid_slice_t.device(place) = res;
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto is_neg = (grid_slice_t < static_cast<T>(0));
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>());
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
auto one_more_flip = (extra > (double_range - extra));
auto reflected =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
auto clipped = reflected.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
auto in_bound = (clipped == reflected).template cast<T>();
grid_scale_t.device(place) =
grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
(is_neg != one_more_flip).template cast<T>()) *
in_bound;
grid_slice_t.device(place) = clipped;
}
}
}
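// As a reading aid: grid_scale ends up holding the d(unnormalized)/d(normalized)
// factor from Unnormalize, zeroed where "border" clipping saturates and
// sign-flipped where "reflection" reverses direction, so the backward pass can
// simply multiply it into the grid gradient.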
template <typename T>
static void CalcGridLocationsWithGrad(const CPUContext& ctx,
const DenseTensor& grid,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_x_scale,
DenseTensor* grid_y_scale) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
grid_x->Resize({n, out_h, out_w});
grid_y->Resize({n, out_h, out_w});
T* grid_x_data = ctx.Alloc<T>(grid_x);
T* grid_y_data = ctx.Alloc<T>(grid_y);
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
ClipWithMask<T>(
ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale);
ClipWithMask<T>(
ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale);
}
template <typename T>
static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
DenseTensor* input_grad,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& d1,
const DenseTensor& d2) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto d1_t = EigenTensor<T, 3>::From(d1);
auto d2_t = EigenTensor<T, 3>::From(d2);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
}
}
}
}
}
}
template <typename T>
static void GatherBilinearGrad(const CPUContext& ctx,
const DenseTensor& input,
const DenseTensor& output_grad,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* grid_x_scale,
DenseTensor* grid_y_scale,
DenseTensor* input_grad,
DenseTensor* grid_grad) {
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
DenseTensor x_w, x_e, y_n, y_s;
DenseTensor d_w, d_e, d_n, d_s;
DenseTensor v_wn, v_en, v_ws, v_es;
AllNeigbors<T>(ctx,
input,
grid_x, // grid_x
grid_y, // grid_y
&x_w,
&x_e,
&y_n,
&y_s,
&d_w,
&d_e,
&d_n,
&d_s,
&v_wn,
&v_en,
&v_ws,
&v_es);
// gather output grad value to input grad by corner point coords and weight
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
if (grid_grad != nullptr) {
DenseTensor grid_grad_x, grid_grad_y;
grid_grad_x.Resize({n, out_h, out_w});
grid_grad_y.Resize({n, out_h, out_w});
ctx.Alloc<T>(&grid_grad_x);
ctx.Alloc<T>(&grid_grad_y);
auto grid_grad_x_t =
EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
auto grid_grad_y_t =
EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
grid_grad_x_t(i, k, l) +=
((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
(v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
output_grad_t(i, j, k, l);
grid_grad_y_t(i, k, l) +=
((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
(v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
output_grad_t(i, j, k, l);
}
}
}
}
// const T x_max = static_cast<T>(in_w - 1);
// const T y_max = static_cast<T>(in_h - 1);
auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
// gather grid_grad [x, y] in 3rd Dim
T* grid_grad_data = grid_grad->data<T>();
T* grid_grad_x_data = grid_grad_x.data<T>();
T* grid_grad_y_data = grid_grad_y.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_grad_data[2 * i] = grid_grad_x_data[i];
grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
}
}
}
template <typename T>
static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
DenseTensor* input_grad,
const DenseTensor& x,
const DenseTensor& y) {
const int n = output_grad.dims()[0];
const int c = output_grad.dims()[1];
const int out_h = output_grad.dims()[2];
const int out_w = output_grad.dims()[3];
const int in_h = input_grad->dims()[2];
const int in_w = input_grad->dims()[3];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
input_grad_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l)))) +=
output_grad_t(i, j, k, l);
}
}
}
}
}
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const DenseTensor& out_grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* x_grad,
DenseTensor* grid_grad) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
x_grad->Resize({n, c, in_h, in_w});
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
if (grid_grad != nullptr) {
grid_grad->Resize({n, out_h, out_w, 2});
dev_ctx.template Alloc<T>(grid_grad);
phi::funcs::SetConstant<Context, T>()(
dev_ctx, grid_grad, static_cast<T>(0));
}
DenseTensor grid_x, grid_y;
DenseTensor grid_x_scale, grid_y_scale;
CalcGridLocationsWithGrad<T>(dev_ctx,
grid,
in_h,
in_w,
align_corners,
padding_mode,
&grid_x,
&grid_y,
&grid_x_scale,
&grid_y_scale);
if (mode == "bilinear") {
GatherBilinearGrad<T>(dev_ctx,
x,
out_grid,
&grid_x,
&grid_y,
&grid_x_scale,
&grid_y_scale,
x_grad,
grid_grad);
} else {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GatherOutputGradToInputGrad<T>(out_grid, x_grad, grid_x, grid_y);
}
}
} // namespace phi
PD_REGISTER_KERNEL(grid_sample_grad,
CPU,
ALL_LAYOUT,
phi::GridSampleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Array4 = Eigen::DSizes<int64_t, 4>;
template <typename T>
static inline void Clip(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners,
std::string padding_mode) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
if (max_val == 0) {
grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
}
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}
template <typename T>
static void CalcGridLocations(const CPUContext& ctx,
const DenseTensor& grid,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
DenseTensor* grid_x,
DenseTensor* grid_y) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
// split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
grid_x->Resize({n, out_h, out_w});
grid_y->Resize({n, out_h, out_w});
T* grid_x_data = ctx.Alloc<T>(grid_x);
T* grid_y_data = ctx.Alloc<T>(grid_y);
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}
Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
Clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
Clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}
template <typename T>
static void BilinearInter(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* out) {
auto& place = *ctx.eigen_device();
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];
DenseTensor x_w, x_e, y_n, y_s;
DenseTensor d_w, d_e, d_n, d_s;
DenseTensor v_wn, v_en, v_ws, v_es;
AllNeigbors<T>(ctx,
input,
grid_x,
grid_y,
&x_w,
&x_e,
&y_n,
&y_s,
&d_w,
&d_e,
&d_n,
&d_s,
&v_wn,
&v_en,
&v_ws,
&v_es);
auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);
auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*out);
  // bilinear interpolation using the 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* out) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
out->Resize(phi::make_ddim({n, c, out_h, out_w}));
dev_ctx.template Alloc<T>(out);
phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
DenseTensor grid_x, grid_y;
CalcGridLocations<T>(
dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y);
if (mode == "bilinear") {
BilinearInter<T>(dev_ctx, x, &grid_x, &grid_y, out);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GetGridPointValue<T>(x, out, grid_x, grid_y);
}
}
} // namespace phi
PD_REGISTER_KERNEL(
grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T>
void Unnormalize(const CPUContext& ctx,
DenseTensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners) {
auto& place = *ctx.eigen_device();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}
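// Unnormalize maps grid coordinates from [-1, 1] into pixel space: with
// align_corners the endpoints land exactly on 0 and max_val; otherwise the
// mapping is ((x + 1) * (max_val + 1) - 1) / 2, matching the two branches above.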
template <typename T>
inline bool IsInBound(T x, T y, T x_max, T y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
}
template <typename T>
void GetGridPointValue(const DenseTensor& input,
DenseTensor* output,
const DenseTensor& x,
const DenseTensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const int out_h = x.dims()[1];
const int out_w = x.dims()[2];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
auto input_t = EigenTensor<T, 4>::From(input);
for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(
x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}
template <typename T>
void AllNeigbors(const CPUContext& ctx,
const DenseTensor& input,
DenseTensor* grid_x,
DenseTensor* grid_y,
DenseTensor* x_w,
DenseTensor* x_e,
DenseTensor* y_n,
DenseTensor* y_s, // positions
DenseTensor* d_w,
DenseTensor* d_e,
DenseTensor* d_n,
DenseTensor* d_s, // distance
DenseTensor* v_wn,
DenseTensor* v_en,
DenseTensor* v_ws,
DenseTensor* v_es) { // values
auto& place = *ctx.eigen_device();
const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
// calculate coords of 4 corner points
x_w->Resize({n, out_h, out_w});
x_e->Resize({n, out_h, out_w});
y_n->Resize({n, out_h, out_w});
y_s->Resize({n, out_h, out_w});
ctx.Alloc<T>(x_w);
ctx.Alloc<T>(x_e);
ctx.Alloc<T>(y_n);
ctx.Alloc<T>(y_s);
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);
auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);
// calculate distances to 4 sides
d_w->Resize({n, out_h, out_w});
d_e->Resize({n, out_h, out_w});
d_n->Resize({n, out_h, out_w});
d_s->Resize({n, out_h, out_w});
ctx.Alloc<T>(d_w);
ctx.Alloc<T>(d_e);
ctx.Alloc<T>(d_n);
ctx.Alloc<T>(d_s);
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;
  // calculate the values at the 4 corner points
v_wn->Resize({n, c, out_h, out_w});
v_en->Resize({n, c, out_h, out_w});
v_ws->Resize({n, c, out_h, out_w});
v_es->Resize({n, c, out_h, out_w});
ctx.Alloc<T>(v_wn);
ctx.Alloc<T>(v_en);
ctx.Alloc<T>(v_ws);
ctx.Alloc<T>(v_es);
GetGridPointValue<T>(input, v_wn, *x_w, *y_n);
GetGridPointValue<T>(input, v_en, *x_e, *y_n);
GetGridPointValue<T>(input, v_ws, *x_w, *y_s);
GetGridPointValue<T>(input, v_es, *x_e, *y_s);
}
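// Together, the corner values (v_wn, v_en, v_ws, v_es) and side distances
// (d_w, d_e, d_n, d_s) computed here are all that the bilinear forward and
// backward paths need: each sampled value is a distance-weighted sum of its
// four neighbouring pixels.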
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi {
template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
const DenseTensor& out_grad,
int dim,
DenseTensor* x_grad) {
if (dim < 0) {
dim += out_grad.dims().size();
}
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectGradInner<Context, T, int>(ctx, out_grad, index, x_grad, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectGradInner<Context, T, int64_t>(
ctx, out_grad, index, x_grad, dim);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select_grad,
CPU,
ALL_LAYOUT,
phi::IndexSelectGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename Context, typename T, class Enable = void>
struct IndexSelectAdd {
void operator()(const Context& ctx,
int slice_size,
const T* src_pointer,
const T* p_pointer,
T* dist_pointer) {
for (int i = 0; i < slice_size; i++) {
dist_pointer[i] = src_pointer[i] + p_pointer[i];
}
}
};
template <typename Context, typename T>
struct IndexSelectAdd<
Context,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const Context& ctx,
int slice_size,
const T* src_pointer,
const T* p_pointer,
T* dist_pointer) {
auto blas = phi::funcs::GetBlas<Context, T>(ctx);
blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
}
};
template <typename Context, typename T, typename IndexT = int>
void IndexSelectInner(const Context& ctx,
DenseTensor* input,
const DenseTensor& index,
DenseTensor* output,
int dim) {
auto input_dim = input->dims();
auto input_dim_size = input_dim.size();
auto output_dim = output->dims();
auto index_size = index.dims()[0];
DenseTensor index_cpu_copy;
if (!paddle::platform::is_cpu_place(index.place())) {
phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
}
const IndexT* index_data = paddle::platform::is_cpu_place(index.place())
? index.data<IndexT>()
: index_cpu_copy.data<IndexT>();
ctx.template Alloc<T>(output);
auto slice_size = 1;
for (auto i = dim + 1; i < input_dim_size; i++) {
slice_size *= input_dim[i];
}
auto outer_nums = 1;
for (auto i = 0; i < dim; i++) {
outer_nums *= input_dim[i];
}
for (int i = 0; i < index_size; i++) {
PADDLE_ENFORCE_GE(
index_data[i],
0,
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
input_dim[dim],
index_data[i]));
PADDLE_ENFORCE_LT(
index_data[i],
input_dim[dim],
phi::errors::InvalidArgument(
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
input_dim[dim],
index_data[i]));
}
VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums
<< "; slice_size: " << slice_size << "; index_size: " << index_size;
input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size}));
output->Resize(phi::make_ddim({outer_nums, index_size, slice_size}));
auto input_tensor = EigenTensor<T, 3>::From(*input);
auto output_tensor = EigenTensor<T, 3>::From(*output);
auto& place = *ctx.eigen_device();
for (auto j = 0; j < index_size; j++) {
IndexT index_value = index_data[j];
auto output_t = output_tensor.chip(j, 1);
output_t.device(place) = input_tensor.chip(index_value, 1);
}
input->Resize(input_dim);
output->Resize(output_dim);
}
template <typename Context, typename T, typename IndexT = int>
void IndexSelectGradInner(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& index,
DenseTensor* x_grad,
int dim) {
const T* input_data = out_grad.data<T>();
const IndexT* index_data = index.data<IndexT>();
const T* p_output = ctx.template Alloc<T>(x_grad);
T* out_data = ctx.template Alloc<T>(x_grad);
auto input_dim = out_grad.dims();
auto input_dim_size = input_dim.size();
auto output_dim = x_grad->dims();
phi::funcs::SetConstant<Context, T> set_constant;
set_constant(ctx, x_grad, static_cast<T>(0.0));
auto slice_size = 1;
for (auto i = dim + 1; i < input_dim_size; i++) {
slice_size *= input_dim[i];
}
auto input_width = slice_size * input_dim[dim];
auto output_width = slice_size * output_dim[dim];
auto outer_nums = 1;
for (auto i = 0; i < dim; i++) {
outer_nums *= input_dim[i];
}
auto index_size = index.dims()[0];
VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums
<< "; slice_size: " << slice_size << "; input_width: " << input_width
<< "; output_width: " << output_width
<< "; index_size: " << index_size;
for (auto i = 0; i < outer_nums; i++) {
auto input_start_offset = i * input_width;
auto output_start_offset = i * output_width;
for (auto j = 0; j < index_size; j++) {
IndexT index_value = index_data[j];
auto src = input_data + input_start_offset + j * slice_size;
auto p_out = p_output + output_start_offset + index_value * slice_size;
auto dst = out_data + output_start_offset + index_value * slice_size;
IndexSelectAdd<Context, T> index_select_add;
index_select_add(ctx, slice_size, src, p_out, dst);
}
}
x_grad->Resize(output_dim);
}
} // namespace phi
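// NOTE(editor): standalone sketch (illustrative names, not kernel code) of the
// gather that IndexSelectInner performs. After the reshape to
// (outer_nums, input_dim[dim], slice_size), each Eigen chip copies one
// contiguous slice, e.g. for input = {{1, 2}, {3, 4}, {5, 6}}, index = {2, 0}
// and dim = 0 the output is {{5, 6}, {1, 2}}.
#include <cstdint>
template <typename T, typename IndexT>
void NaiveIndexSelectDim0(const T* in,
                          const IndexT* index,
                          int64_t index_size,
                          int64_t slice_size,
                          T* out) {
  for (int64_t j = 0; j < index_size; ++j) {
    const T* src = in + index[j] * slice_size;
    for (int64_t k = 0; k < slice_size; ++k) {
      out[j * slice_size + k] = src[k];
    }
  }
}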
...@@ -12,32 +12,50 @@ ...@@ -12,32 +12,50 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_prod_kernel.h" #include "paddle/phi/kernels/index_select_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void ReduceProdKernel(const Context& dev_ctx, void IndexSelectKernel(const Context& ctx,
const DenseTensor& x, const DenseTensor& x,
const std::vector<int64_t>& dims, const DenseTensor& index,
bool keep_dim, int dim,
bool reduce_all, DenseTensor* output) {
DenseTensor* out) { auto inputs = x;
auto out_dtype = x.dtype(); if (dim < 0) {
phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>( dim += inputs.dims().size();
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); }
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectInner<Context, T, int>(ctx, &inputs, index, output, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectInner<Context, T, int64_t>(ctx, &inputs, index, output, dim);
}
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(reduce_prod, PD_REGISTER_KERNEL(index_select,
CPU, CPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ReduceProdKernel, phi::IndexSelectKernel,
float, float,
double, double,
int, int,
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/lgamma_kernel.h" #include "paddle/phi/kernels/lgamma_kernel.h"
#include <unsupported/Eigen/SpecialFunctions>
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
......
...@@ -19,10 +19,8 @@ ...@@ -19,10 +19,8 @@
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
...@@ -55,30 +53,6 @@ namespace phi { ...@@ -55,30 +53,6 @@ namespace phi {
} \ } \
} }
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context> template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx, void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
...@@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw, ...@@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw,
complex64, complex64,
complex128, complex128,
phi::dtype::bfloat16) {} phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(sum_raw,
CPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
namespace phi { namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
const DenseTensor& ids,
const DenseTensor& out_grad,
std::vector<DenseTensor*> ins_grad) {
size_t idx = -1UL;
for (size_t i = 0; i < ins_grad.size(); i++) {
if (ins_grad[i]) {
ctx.template Alloc<T>(ins_grad[i]);
auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = ins_grad[idx]->dims()[0];
auto cols = ins_grad[idx]->numel() / rows;
auto* index = ids.data<int32_t>();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multiplex_grad,
CPU,
ALL_LAYOUT,
phi::MultiplexGradKernel,
float,
double,
int,
int64_t) {}
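// NOTE(editor): assumed row-wise illustration of the forward multiplex op that
// the gradient above inverts: row i of out is copied from row i of the
// candidate selected by ids[i], so the gradient scatters row i of out_grad
// back into ins_grad[ids[i]] and leaves the other candidates zeroed.
//   ins[0] = {{1, 1}, {2, 2}}, ins[1] = {{3, 3}, {4, 4}}, ids = {1, 0}
//   out    = {{3, 3}, {2, 2}}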
...@@ -12,28 +12,54 @@ ...@@ -12,28 +12,54 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h" #include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx, void MultiplexKernel(const Context& ctx,
const DenseTensor& x, const std::vector<const DenseTensor*>& ins,
const std::vector<int64_t>& dims, const DenseTensor& ids,
bool keep_dim,
bool reduce_all,
DenseTensor* out) { DenseTensor* out) {
auto out_dtype = x.dtype(); ctx.template Alloc<T>(out);
phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>( for (size_t i = 0; i < ins.size(); ++i) {
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); PADDLE_ENFORCE_GT(
ins[i]->numel(),
0,
errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids.data<int32_t>();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k),
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
}
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(multiplex,
min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} CPU,
ALL_LAYOUT,
phi::MultiplexKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Dense>
#include "paddle/phi/kernels/qr_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
namespace phi {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
template <typename T, typename Context>
void QrKernel(const Context& ctx,
const DenseTensor& x,
const std::string& mode,
DenseTensor* q,
DenseTensor* r) {
bool compute_q;
bool reduced_mode;
std::tie(compute_q, reduced_mode) = ParseQrMode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(
numel, 0, errors::PreconditionNotMet("The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = ctx.template Alloc<phi::dtype::Real<T>>(
q, batch_size * m * k * sizeof(phi::dtype::Real<T>));
}
auto* r_data = ctx.template Alloc<phi::dtype::Real<T>>(
r, batch_size * k * n * sizeof(phi::dtype::Real<T>));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {}
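// NOTE(editor): self-contained sketch (assumed example, not kernel code) that
// checks the reduced-mode shapes produced above with Eigen's HouseholderQR:
// Q is m x min(m, n), R is min(m, n) x n, and Q * R reconstructs the input up
// to rounding error.
#include <Eigen/Dense>
#include <iostream>
int main() {
  using Mat = Eigen::MatrixXd;
  const int m = 4, n = 3;
  Mat a = Mat::Random(m, n);
  Eigen::HouseholderQR<Mat> qr(a);
  Mat q = qr.householderQ() * Mat::Identity(m, n);  // thin Q: 4 x 3
  Mat r = qr.matrixQR().topRows(n).triangularView<Eigen::Upper>();
  std::cout << (q * r - a).norm() << "\n";          // ~0 up to rounding
  return 0;
}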
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi {
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>(
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>(
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(sum_raw,
CPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
PD_REGISTER_KERNEL(prod_raw,
CPU,
ALL_LAYOUT,
phi::ProdRawKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
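// NOTE(editor): assumed illustration of the attribute conventions shared by
// the *_raw reduce kernels above, using sum on x = {{1, 2}, {3, 4}}:
//   dims = {1}, keep_dim = false  ->  {3, 7}      (shape [2])
//   dims = {1}, keep_dim = true   ->  {{3}, {7}}  (shape [2, 1])
//   reduce_all = true             ->  10          (every element reduced)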
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <class T>
void bilinear_interpolate_gradient(const int height,
const int width,
T y,
T x,
const T out_grad_this_bin,
const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
DenseTensor* dx) {
auto in_dims = x.dims();
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = boxes.dims()[0];
if (!dx) {
return;
}
DenseTensor roi_batch_id_list = Empty<int>(dev_ctx, {rois_num});
int* box_batch_id_data = roi_batch_id_list.data<int>();
int boxes_batch_size;
if (boxes_num) {
boxes_batch_size = boxes_num->numel();
auto* boxes_num_data = boxes_num->data<int>();
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_data[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_data[n];
}
} else {
auto boxes_lod = boxes.lod().back();
boxes_batch_size = boxes_lod.size() - 1;
for (int n = 0; n < boxes_batch_size; ++n) {
for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dx, static_cast<T>(0));
int output_grad_size = out_grad.numel();
if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* boxes_data = boxes.data<T>();
const T* out_grad_data = out_grad.data<T>();
T* dx_data = dev_ctx.template Alloc<T>(dx);
auto in_stride = phi::stride(x.dims());
auto roi_stride = phi::stride(boxes.dims());
auto out_stride = phi::stride(out_grad.dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int box_batch_idx = box_batch_id_data[n];
T roi_xmin = boxes_data[0] * spatial_scale - roi_offset;
T roi_ymin = boxes_data[1] * spatial_scale - roi_offset;
T roi_xmax = boxes_data[2] * spatial_scale - roi_offset;
T roi_ymax = boxes_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
dx_data + box_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height,
width,
y,
x,
out_grad_this_bin,
count,
batch_grad_data);
}
}
}
}
}
boxes_data += roi_stride[0];
}
}
} // namespace phi
PD_REGISTER_KERNEL(roi_align_grad,
CPU,
ALL_LAYOUT,
phi::RoiAlignGradKernel,
float,
double,
int) {}
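// NOTE(editor): assumed worked example of the box-to-feature-map transform in
// RoiAlignGradKernel: with spatial_scale = 0.25 and aligned = true
// (roi_offset = 0.5), an ROI corner at image coordinate 40.0 maps to feature
// coordinate 40.0 * 0.25 - 0.5 = 9.5; with aligned = false no offset is
// subtracted. Each pooled bin then averages roi_bin_grid_h * roi_bin_grid_w
// bilinear samples, which is why every distributed gradient is divided by
// `count`.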
...@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values, ...@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values,
} }
template <typename T, typename Context> template <typename T, typename Context>
void ROIAlignKernel(const Context& dev_ctx, void RoiAlignKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& boxes, const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num, paddle::optional<const DenseTensor&> boxes_num,
...@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx, ...@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx,
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {}
...@@ -12,28 +12,53 @@ ...@@ -12,28 +12,53 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx, void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const std::vector<int64_t>& dims, const DenseTensor& out_grad,
bool keep_dim, const ScalarArray& shifts,
bool reduce_all, const std::vector<int64_t>& axis,
DenseTensor* out) { DenseTensor* x_grad) {
auto out_dtype = x.dtype(); std::vector<T> out_vec;
phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>( paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec);
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = out_grad.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]);
}
dev_ctx.template Alloc<T>(x_grad);
paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad);
x_grad->Resize(out_grad.dims());
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(roll_grad,
max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} CPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
std::vector<T> out_vec;
paddle::framework::TensorToVector(x, dev_ctx, &out_vec);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = x.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i,
input_dim.size(),
input_dim.size() - 1,
i,
dims[i]));
ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]);
}
dev_ctx.template Alloc<T>(out);
paddle::framework::TensorFromVector(out_vec, dev_ctx, out);
out->Resize(x.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
CPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
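// NOTE(editor): minimal sketch (assumed helper, not kernel code) of the 1-D
// behaviour the kernel above delegates to ShiftAlongDim: a positive shift
// moves elements toward higher indices and wraps them around, e.g.
// NaiveRoll1D({1, 2, 3, 4, 5}, 2) returns {4, 5, 1, 2, 3}.
#include <algorithm>
#include <cstdint>
#include <vector>
inline std::vector<int> NaiveRoll1D(std::vector<int> v, int64_t shift) {
  const int64_t n = static_cast<int64_t>(v.size());
  if (n == 0) return v;
  shift = ((shift % n) + n) % n;  // normalize; also handles negative shifts
  std::rotate(v.begin(), v.end() - shift, v.end());
  return v;
}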
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -13,21 +13,16 @@ ...@@ -13,21 +13,16 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { #include "paddle/phi/common/scalar_array.h"
namespace operators { #include "paddle/phi/core/dense_tensor.h"
using Tensor = framework::Tensor; namespace phi {
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
template <typename T> template <typename T>
inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, inline void ShiftAlongDim(T* data,
const DDim& input_dim,
int64_t dim,
int64_t shift) { int64_t shift) {
if (dim < 0) { if (dim < 0) {
dim += input_dim.size(); dim += input_dim.size();
...@@ -78,92 +73,4 @@ inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, ...@@ -78,92 +73,4 @@ inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim,
} }
} }
template <typename DeviceContext, typename T> } // namespace phi
class RollKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar("X");
auto* output_var = context.OutputVar("Out");
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i, input_dim.size(), input_dim.size() - 1, i, dims[i]));
shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
template <typename DeviceContext, typename T>
class RollGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar(framework::GradVarName("Out"));
auto* output_var = context.OutputVar(framework::GradVarName("X"));
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
} // namespace operators
} // namespace paddle
...@@ -12,26 +12,18 @@ ...@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_any_kernel.h" #include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { PD_REGISTER_KERNEL(tril_triu_grad,
CPU,
template <typename T, typename Context> ALL_LAYOUT,
void AnyRawKernel(const Context& dev_ctx, phi::TrilTriuGradKernel,
const DenseTensor& x, bool,
const std::vector<int64_t>& dims, float,
bool keep_dim, double,
bool reduce_all, int,
DenseTensor* out) { int64_t,
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>( phi::dtype::float16) {}
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
...@@ -12,26 +12,18 @@ ...@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi { PD_REGISTER_KERNEL(tril_triu,
CPU,
template <typename T, typename Context> ALL_LAYOUT,
void AllRawKernel(const Context& dev_ctx, phi::TrilTriuKernel,
const DenseTensor& x, bool,
const std::vector<int64_t>& dims, float,
bool keep_dim, double,
bool reduce_all, int,
DenseTensor* out) { int64_t,
phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>( phi::dtype::float16) {}
dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
...@@ -29,11 +29,17 @@ ...@@ -29,11 +29,17 @@
#include <type_traits> #include <type_traits>
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#ifdef PADDLE_WITH_XPU_KP
#define __forceinline__ __inline__
#endif
namespace phi { namespace phi {
namespace funcs { namespace funcs {
...@@ -776,6 +782,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -776,6 +782,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x - x.tanh();
}
};
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x.tanh() * x.tanh());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// hardshrink(x) = x, if x > threshold or x < -threshold
//                 0, otherwise
template <typename T>
struct HardShrinkFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 || temp2).template cast<T>();
}
};
template <typename T>
struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 || temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
template <typename T>
struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
(x < static_cast<T>(0))
.select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
}
};
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
dx.device(d) = (out > static_cast<T>(0))
.select(dout, dout * (out + static_cast<T>(alpha)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
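// NOTE(editor): why (out + alpha) appears above: for x <= 0 and alpha >= 0,
// out = alpha * (e^x - 1), hence d(out)/dx = alpha * e^x = out + alpha.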
template <typename T>
struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
dx.device(d) = (x > static_cast<T>(0))
.select(dout, dout * static_cast<T>(alpha) * x.exp());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* X,
const DenseTensor* ddX,
DenseTensor* ddOut,
const DenseTensor* dOut,
DenseTensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
auto x = EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
if (dX) {
auto dx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>();
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
ddout.device(*d) = ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
out.device(d) = x * temp;
}
};
// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})
template <typename T>
struct SiluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) + (-x).exp(); // 1+e^(-x)
auto temp2 = x * (-x).exp(); // x*e^(-x)
dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
(static_cast<T>(1) + (temp2 / temp1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
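// NOTE(editor): derivation sketch for the two silu functors above: with
// s(x) = 1 / (1 + e^(-x)) = 1 / temp1,
//   silu(x)  = x * s(x)
//   silu'(x) = s(x) + x * s(x) * (1 - s(x))
//            = (1 / temp1) * (1 + temp2 / temp1),   temp2 = x * e^(-x),
// which is exactly the expression assigned to dx in SiluGradFunctor.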
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T> template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> { struct CudaReluFunctor : public BaseActivationFunctor<T> {
...@@ -1214,6 +1450,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -1214,6 +1450,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// softshrink(x) = x - lambda, if x > lambda;
// x + lambda, if x < -lambda;
// 0, otherwise.
__device__ __forceinline__ T operator()(const T x) const {
T l = static_cast<T>(lambda);
T temp1 = static_cast<T>(x > l);
T temp2 = static_cast<T>(x < -l);
return temp1 * (x - l) + temp2 * (x + l);
}
};
template <typename T>
struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// dx = dout, if x > lambda or x < -lambda else 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T l = static_cast<T>(lambda);
return (x >= -l && x <= l) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// tanhshrink(x) = x - tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x - tanh(x));
}
};
template <typename T>
struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// dx = dout * tanh(x)^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(dout * tanh(x) * tanh(x));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
  // hardshrink(x) = (x > -threshold && x < threshold) ? 0 : x
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : x;
}
};
template <typename T>
struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (x > -threshold && x < threshold) ? 0 : dout
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaELUFunctor : public BaseActivationFunctor<T> {
using CT = typename phi::dtype::MPTypeTrait<T>::Type;
CT zero = static_cast<CT>(0.0f);
CT one = static_cast<CT>(1.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// elu(x) = x, if x > 0
// elu(x) = alpha * (e^x - 1), if x <= 0
__device__ __forceinline__ T operator()(const T arg_x) const {
CT x = static_cast<CT>(arg_x);
CT temp = static_cast<CT>(alpha) * (exp(x) - one);
CT res = x > zero ? x : temp;
return static_cast<T>(res);
}
};
template <typename T>
struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
__device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType a = static_cast<MPType>(alpha);
MPType out_pos = static_cast<MPType>(out > zero);
MPType out_neg = static_cast<MPType>(out <= zero);
return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_out,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(alpha);
MPType x_pos = static_cast<MPType>(x > zero);
MPType x_neg = static_cast<MPType>(x <= zero);
return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSiluFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// silu(x) = x / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x / (one + exp(-x)));
}
};
template <typename T>
struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
  // dx = dout * (1 + exp(-x) + x * exp(-x)) / (1 + exp(-x))^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp = one / (one + exp(-x));
return static_cast<T>(dout * (temp * (one + x * (one - temp))));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#endif #endif
} // namespace funcs } // namespace funcs
......
...@@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint( ...@@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint(
} }
inline void GetOutShape(const DDim& x_dims, inline void GetOutShape(const DDim& x_dims,
const DDim& kernel_dims, const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
...@@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims, ...@@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims,
x_dims.size(), x_dims.size(),
5, 5,
phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)"));
PADDLE_ENFORCE_EQ(kernel_dims.size(), PADDLE_ENFORCE_EQ(kernel_sizes.size(),
5, 5,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"the shape of kernel should be (D, H, W, C, OC)")); "the shape of kernel should be (D, H, W, C, OC)"));
// infer out shape // infer out shape
(*out_dims)[0] = x_dims[0]; (*out_dims)[0] = x_dims[0];
(*out_dims)[4] = kernel_dims[4]; (*out_dims)[4] = kernel_sizes[4];
for (int i = 1; i < 4; i++) { for (int i = 1; i < 4; i++) {
(*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] -
dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) /
strides[i - 1] + strides[i - 1] +
1; 1;
} }
...@@ -131,7 +131,7 @@ template <typename T, typename Context> ...@@ -131,7 +131,7 @@ template <typename T, typename Context>
inline void SubmPreProcess(const Context& dev_ctx, inline void SubmPreProcess(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out_grad, const DenseTensor& out_grad,
const int in_channels, const int in_channels,
const int out_channels, const int out_channels,
const int half_kernel_size, const int half_kernel_size,
...@@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx, ...@@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx,
blas.GEMM(CblasTrans, blas.GEMM(CblasTrans,
CblasNoTrans, CblasNoTrans,
x.non_zero_elements().dims()[1], x.non_zero_elements().dims()[1],
out_grad.non_zero_elements().dims()[1], out_grad.dims()[1],
x.non_zero_elements().dims()[0], x.non_zero_elements().dims()[0],
static_cast<T>(1), static_cast<T>(1),
x.non_zero_elements().data<T>(), x.non_zero_elements().data<T>(),
out_grad.non_zero_elements().data<T>(), out_grad.data<T>(),
static_cast<T>(0), static_cast<T>(0),
d_kernel_ptr + half_kernel_size * in_channels * out_channels); d_kernel_ptr + half_kernel_size * in_channels * out_channels);
...@@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx, ...@@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx,
T* x_grad_ptr = x_grad->data<T>(); T* x_grad_ptr = x_grad->data<T>();
blas.GEMM(CblasNoTrans, blas.GEMM(CblasNoTrans,
CblasTrans, CblasTrans,
out_grad.non_zero_elements().dims()[0], out_grad.dims()[0],
in_channels, in_channels,
out_grad.non_zero_elements().dims()[1], out_grad.dims()[1],
static_cast<T>(1), static_cast<T>(1),
out_grad.non_zero_elements().data<T>(), out_grad.data<T>(),
kernel.data<T>() + half_kernel_size * in_channels * out_channels, kernel.data<T>() + half_kernel_size * in_channels * out_channels,
static_cast<T>(0), static_cast<T>(0),
x_grad_ptr); x_grad_ptr);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
namespace funcs {
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in,
const int diagonal,
const bool lower,
const int64_t H,
const int64_t W,
T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
} // namespace funcs
} // namespace phi
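To illustrate the masking predicate used by TrilTriuCompute, here is a minimal standalone C++ sketch; it mirrors the `lower_ ? (col - row > diagonal_) : (col - row < diagonal_)` test instead of including the phi headers, so the helper name tril_mask is illustrative only.

#include <cstdio>

// Mirrors TrilTriuCompute: an element is zeroed when the predicate is true.
static bool tril_mask(int row, int col, int diagonal, bool lower) {
  return lower ? (col - row > diagonal) : (col - row < diagonal);
}

int main() {
  const int H = 3, W = 4, diagonal = 0;
  for (int row = 0; row < H; ++row) {
    for (int col = 0; col < W; ++col) {
      // Print 1 for elements kept by tril (lower = true), 0 for zeroed ones.
      std::printf("%d ", tril_mask(row, col, diagonal, true) ? 0 : 1);
    }
    std::printf("\n");
  }
  return 0;
}

Running it prints the expected lower-triangular pattern of ones (1 0 0 0 / 1 1 0 0 / 1 1 1 0); flipping lower to false gives the corresponding upper-triangular mask (the diagonal is kept in both cases).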
...@@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
} }
} }
#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \ const DenseTensor& x, \
...@@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \ name, functor_class, attr1, attr2) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, &x, nullptr, &dout, dx, functor); \ dev_ctx, &x, nullptr, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \ const DenseTensor& out, \
...@@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \ name, functor_class, attr) \
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \ void name##GradKernel(const Context& dev_ctx, \
...@@ -142,32 +142,62 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ...@@ -142,32 +142,62 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, nullptr, &out, &dout, dx, functor); \ dev_ctx, nullptr, &out, &dout, dx, functor); \
} }
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor, CudaLeakyReluGradFunctor,
alpha); alpha);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
CudaThresholdedReluGradFunctor, CudaThresholdedReluGradFunctor,
threshold); threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
CudaSoftShrinkGradFunctor,
lambda);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
CudaHardShrinkGradFunctor,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
CudaBReluGradFunctor, CudaBReluGradFunctor,
t_min, t_min,
t_max); t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
std::vector<const DenseTensor*> ins = {&dout, &out};
std::vector<DenseTensor*> outs = {dx};
if (alpha > 0) {
funcs::CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
} else {
funcs::CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(&x);
funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
}
}
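A note on the alpha branch above (my reading of the two functors, not stated in the source): when alpha > 0, the gradient can be computed from out alone, since out > 0 exactly when x > 0 and, for x <= 0, out + alpha = alpha * exp(x) is the required local derivative; when alpha < 0 the sign of out no longer determines the sign of x, so the negative-alpha path pushes x as an extra input and branches on it directly.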
} // namespace phi } // namespace phi
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -234,3 +264,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, ...@@ -234,3 +264,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel) LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel) ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
...@@ -42,8 +42,9 @@ void ActivationGPUImpl(const Context& dev_ctx, ...@@ -42,8 +42,9 @@ void ActivationGPUImpl(const Context& dev_ctx,
template <typename T, typename Context> \ template <typename T, typename Context> \
void name##Kernel( \ void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class functor; \ funcs::functor_class<T> functor; \
ActivationGPUImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \ ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
} }
#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ #define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
...@@ -75,24 +76,31 @@ void ActivationGPUImpl(const Context& dev_ctx, ...@@ -75,24 +76,31 @@ void ActivationGPUImpl(const Context& dev_ctx,
dev_ctx, x, out, functor); \ dev_ctx, x, out, functor); \
} }
DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor<T>) DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
CudaThresholdedReluFunctor, CudaThresholdedReluFunctor,
threshold) threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
CudaHardShrinkFunctor,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
...@@ -142,3 +150,8 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) ...@@ -142,3 +150,8 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace phi {
template <typename T>
static __forceinline__ __device__ void AtomicAdd(
T* data, int h, int w, int sH, int sW, int H, int W, T delta) {
if (InBounds(h, w, H, W)) {
paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T
UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexesWithMask(T in,
int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
ComputePositionsWithMask(T coord,
int size,
PaddingMode padding_mode,
bool align_corners,
T* grad_in) {
T grad_clip, grad_refl;
coord = UnnormalizeWithMask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = ClipIndexesWithMask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
template <typename T>
__global__ void GridSamplerCudaBackwardKernel(const int nthreads,
const T* grad_output,
const T* input,
const T* grid,
int n,
int out_c,
int out_h,
int out_w,
int in_h,
int in_w,
T* grad_input,
T* grad_grid,
const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = ComputePositionsWithMask(
ix, in_w, padding_mode, align_corners, &gix_mult);
iy = ComputePositionsWithMask(
iy, in_h, padding_mode, align_corners, &giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c,
inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC,
gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
AtomicAdd(
gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut);
AtomicAdd(
gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut);
AtomicAdd(
gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut);
AtomicAdd(
gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut);
if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (InBounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
AtomicAdd(gInp_ptr_NC,
iy_nearest,
ix_nearest,
inp_sH,
inp_sW,
in_h,
in_w,
grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const DenseTensor& out_grad,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* x_grad,
DenseTensor* grid_grad) {
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
}
} // namespace phi
PD_REGISTER_KERNEL(grid_sample_grad,
GPU,
ALL_LAYOUT,
phi::GridSampleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/grid_sample_kernel.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
namespace phi {
template <typename T>
static __forceinline__ __device__ T Unnormalize(T coord,
int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T ClipIndexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T ReflectIndexes(T in,
int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T ComputePositions(T coord,
int size,
PaddingMode padding_mode,
bool align_corners) {
coord = Unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = ClipIndexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = ReflectIndexes(coord, 0, 2 * (size - 1));
} else {
coord = ReflectIndexes(coord, -1, 2 * size - 1);
}
coord = ClipIndexes(coord, size - 1);
}
return coord;
}
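To make Unnormalize concrete (a worked example with illustrative numbers): for size = 4, the normalized coordinate -1 maps to pixel 0 with align_corners = true ((-1 + 1) / 2 * 3 = 0) and to -0.5 with align_corners = false (((-1 + 1) * 4 - 1) / 2 = -0.5); likewise +1 maps to 3 versus 3.5. The border and reflect paths then pull such values back into [0, size - 1].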
template <typename T>
__global__ void GridSampleCudaKernel(const int nthreads,
int n,
int out_c,
int out_h,
int out_w,
int in_h,
int in_w,
const T* input,
const T* grid,
T* output,
const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = ComputePositions(ix, in_w, padding_mode, align_corners);
iy = ComputePositions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (InBounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
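A short note on the bilinear weights above: writing a = ix - ix_nw and b = iy - iy_nw, the four factors reduce to nw = (1 - a)(1 - b), ne = a(1 - b), sw = (1 - a)b and se = a*b, which sum to 1, so each output value is a convex combination of its four neighbouring input pixels; neighbours rejected by InBounds simply contribute nothing.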
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& grid,
const std::string& mode,
const std::string& padding_mode,
bool align_corners,
DenseTensor* out) {
PaddingMode enum_padding_mode;
Mode enum_mode;
if (padding_mode == "border") {
enum_padding_mode = PaddingMode::border;
} else if (padding_mode == "reflection") {
enum_padding_mode = PaddingMode::reflect;
} else {
enum_padding_mode = PaddingMode::zeros;
}
if (mode == "nearest") {
enum_mode = Mode::nearest;
} else {
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
}
} // namespace phi
PD_REGISTER_KERNEL(
grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
enum class Mode {
bilinear,
nearest,
};
enum class PaddingMode { zeros, border, reflect };
static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_grad_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index,
int64_t nums,
int64_t N,
int64_t stride,
int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
const DenseTensor& out_grad,
int dim,
DenseTensor* x_grad) {
auto* output_grad_data = out_grad.data<T>();
auto* in_grad_data = ctx.template Alloc<T>(x_grad);
auto input_dim = x_grad->dims();
auto output_dim = out_grad.dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
int64_t numel = x_grad->numel();
int64_t index_nums = index.numel();
int64_t out_nums = out_grad.numel();
auto stream = ctx.stream();
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(in_grad_data, numel);
if (index_type == phi::DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(output_grad_data,
in_grad_data,
index_data,
index_nums,
out_nums,
stride,
size,
delta);
phi::backends::gpu::GpuStreamSync(stream);
} else {
const int* index_data = index.data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(output_grad_data,
in_grad_data,
index_data,
index_nums,
out_nums,
stride,
size,
delta);
phi::backends::gpu::GpuStreamSync(stream);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select_grad,
GPU,
ALL_LAYOUT,
phi::IndexSelectGradKernel,
float,
double,
phi::dtype::float16,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input,
T* output,
const IndexT* index,
int64_t N,
int64_t stride,
int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
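The index arithmetic above flattens a gather along dim: with stride the product of the dimensions after dim, size the number of selected indices and delta = input_dim[dim] - size, an output offset decomposes as idx = pre_idx * (size * stride) + dim_idx * stride + inner, and substituting index[dim_idx] for dim_idx while widening the dim extent by delta yields idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride. As a worked example with illustrative shapes: for x of shape (2, 4, 3), dim = 1 and index = [1, 3], we get stride = 3, size = 2, delta = 2; output element idx = 4 (that is, [0, 1, 1]) has pre_idx = 0, dim_idx = 1, src_dim_idx = 3, so it reads input_idx = 4 + (0 + 3 - 1) * 3 = 10, which is x[0, 3, 1].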
template <typename T, typename Context>
void IndexSelectKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
int dim,
DenseTensor* output) {
auto input_dim = x.dims();
auto output_dim = output->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
auto* in_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(output);
int64_t numel = output->numel();
auto stream = ctx.stream();
if (index_type == phi::DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(in_data, out_data, index_data, numel, stride, size, delta);
phi::backends::gpu::GpuStreamSync(stream);
} else {
const int* index_data = index.data<int>();
index_select_cuda_kernel<
T,
int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
phi::backends::gpu::GpuStreamSync(stream);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select,
GPU,
ALL_LAYOUT,
phi::IndexSelectKernel,
float,
double,
phi::dtype::float16,
int,
int64_t) {}
...@@ -56,30 +56,6 @@ namespace phi { ...@@ -56,30 +56,6 @@ namespace phi {
* Kernels * Kernels
*/ */
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
// Create the definition of Add // Create the definition of Add
DEFINE_CUDA_ELEMENTWISE_OP(Add) DEFINE_CUDA_ELEMENTWISE_OP(Add)
// Create the definition of Subtract // Create the definition of Subtract
...@@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw, ...@@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw,
complex64, complex64,
complex128, complex128,
bfloat16) {} bfloat16) {}
PD_REGISTER_KERNEL(sum_raw,
GPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
float16,
bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(mean_raw,
GPU,
ALL_LAYOUT,
phi::MeanRawKernel,
float,
double,
bool,
float16,
int,
int64_t) {}
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
namespace phi { namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
const DenseTensor& ids,
const DenseTensor& out_grad,
std::vector<DenseTensor*> ins_grad) {
size_t idx = -1UL;
for (size_t i = 0; i < ins_grad.size(); i++) {
if (ins_grad[i]) {
ctx.template Alloc<T>(ins_grad[i]);
auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = ins_grad[idx]->dims()[0];
auto cols = ins_grad[idx]->numel() / rows;
DenseTensor index_t_cpu;
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multiplex_grad,
GPU,
ALL_LAYOUT,
phi::MultiplexGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MultiplexKernel(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
const DenseTensor& ids,
DenseTensor* out) {
ctx.template Alloc<T>(out);
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(),
0,
errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
DenseTensor index_t_cpu;
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k),
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
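In other words, row i of out is copied from row i of the candidate tensor chosen by ids[i]. For example (illustrative values): with two 2 x 3 candidates and ids = [1, 0], row 0 of out comes from ins[1] and row 1 from ins[0], each copy moving cols * sizeof(T) bytes on the device stream.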
} // namespace phi
PD_REGISTER_KERNEL(multiplex,
GPU,
ALL_LAYOUT,
phi::MultiplexKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MulFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MaxFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::LogicalOrFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
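All of the raw reduce kernels above are the same phi::Reduce instantiation and differ only in the reduction functor (Add, Mul, Max, Min, LogicalAnd, LogicalOr); MeanRawKernel additionally passes kps::DivideFunctor as the transform, which presumably divides the accumulated sum by the number of reduced elements so that the sum becomes a mean.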
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(sum_raw,
GPU,
ALL_LAYOUT,
phi::SumRawKernel,
bool,
float,
double,
float16,
bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(mean_raw,
GPU,
ALL_LAYOUT,
phi::MeanRawKernel,
float,
double,
bool,
float16,
int,
int64_t) {}
PD_REGISTER_KERNEL(prod_raw,
GPU,
ALL_LAYOUT,
phi::ProdRawKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
Licensed under the Apache License, Version 2.0 (the "License"); #include "paddle/phi/kernels/roi_align_grad_kernel.h"
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/roi_align_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle { namespace phi {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512; static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kNumMaxinumNumBlocks = 4096;
...@@ -34,10 +36,18 @@ static inline int NumBlocks(const int N) { ...@@ -34,10 +36,18 @@ static inline int NumBlocks(const int N) {
} }
template <class T> template <class T>
__device__ void BilinearInterpolateGradient(const int height, const int width, __device__ void BilinearInterpolateGradient(const int height,
T y, T x, T* w1, T* w2, T* w3, const int width,
T* w4, int* x_low, int* x_high, T y,
int* y_low, int* y_high) { T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) { if (y < -1.0 || y > height || x < -1.0 || x > width) {
return; return;
} }
...@@ -66,12 +76,20 @@ __device__ void BilinearInterpolateGradient(const int height, const int width, ...@@ -66,12 +76,20 @@ __device__ void BilinearInterpolateGradient(const int height, const int width,
} }
template <typename T> template <typename T>
__global__ void GPUROIAlignBackward( __global__ void GPURoiAlignBackward(const int nthreads,
const int nthreads, const T* input_rois, const T* out_grad, const T* input_rois,
const int num_rois, const float spatial_scale, const int channels, const T* out_grad,
const int height, const int width, const int pooled_height, const int num_rois,
const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, const float spatial_scale,
                                    const int channels,
                                    const int height,
                                    const int width,
                                    const int pooled_height,
                                    const int pooled_width,
                                    const int sampling_ratio,
                                    int* roi_batch_id_data,
                                    T* input_grad,
                                    const bool continuous_coordinate) {
  CUDA_KERNEL_LOOP(i, nthreads) {
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
...@@ -119,109 +137,124 @@ __global__ void GPURoiAlignBackward(
                       static_cast<T>(roi_bin_grid_w);
        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
        BilinearInterpolateGradient(height,
                                    width,
                                    y,
                                    x,
                                    &w1,
                                    &w2,
                                    &w3,
                                    &w4,
                                    &x_low,
                                    &x_high,
                                    &y_low,
                                    &y_high);
        T diff1 = out_grad_this_bin * w1 / count;
        T diff2 = out_grad_this_bin * w2 / count;
        T diff3 = out_grad_this_bin * w3 / count;
        T diff4 = out_grad_this_bin * w4 / count;
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_low * width + x_low, diff1);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_low * width + x_high, diff2);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_high * width + x_low, diff3);
          paddle::platform::CudaAtomicAdd(
              offset_input_grad + y_high * width + x_high, diff4);
        }
      }
    }
  }
}

template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& boxes,
                        paddle::optional<const DenseTensor&> boxes_num,
                        const DenseTensor& out_grad,
                        int pooled_height,
                        int pooled_width,
                        float spatial_scale,
                        int sampling_ratio,
                        bool aligned,
                        DenseTensor* dx) {
  int rois_num = boxes.dims()[0];
  int channels = x.dims()[1];
  int height = x.dims()[2];
  int width = x.dims()[3];

  if (!dx) {
    return;
  }

  DenseTensor box_batch_id_list;
  box_batch_id_list.Resize({rois_num});
  int* box_batch_size = dev_ctx.template HostAlloc<int>(&box_batch_id_list);

  auto cplace = phi::CPUPlace();
  auto gplace = dev_ctx.GetPlace();
  if (boxes_num) {
    int boxes_batch_size = boxes_num->numel();
    std::vector<int> boxes_num_list(boxes_batch_size);
    paddle::memory::Copy(cplace,
                         boxes_num_list.data(),
                         gplace,
                         boxes_num->data<int>(),
                         sizeof(int) * boxes_batch_size,
                         0);
    int start = 0;
    for (int n = 0; n < boxes_batch_size; ++n) {
      for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
        box_batch_size[i] = n;
      }
      start += boxes_num_list[n];
    }
  } else {
    auto boxes_lod = boxes.lod().back();
    int boxes_batch_size = boxes_lod.size() - 1;
    for (int n = 0; n < boxes_batch_size; ++n) {
      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
        box_batch_size[i] = n;
      }
    }
  }
  auto roi_ptr =
      paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int));
  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
  int bytes = box_batch_id_list.numel() * sizeof(int);
  paddle::memory::Copy(
      gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
  dev_ctx.template Alloc<T>(dx);

  phi::funcs::SetConstant<Context, T> set_zero;
  set_zero(dev_ctx, dx, static_cast<T>(0));

  int output_grad_size = out_grad.numel();
  int blocks = NumBlocks(output_grad_size);
  int threads = kNumCUDAThreads;
  if (output_grad_size > 0) {
    GPURoiAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
        output_grad_size,
        boxes.data<T>(),
        out_grad.data<T>(),
        rois_num,
        spatial_scale,
        channels,
        height,
        width,
        pooled_height,
        pooled_width,
        sampling_ratio,
        roi_id_data,
        dx->data<T>(),
        aligned);
  }
}

} // namespace phi

PD_REGISTER_KERNEL(
    roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {}
...@@ -71,7 +71,7 @@ __device__ T BilinearInterpolate(
}

template <class T>
__global__ void GPURoiAlignForward(const int nthreads,
                                   const T* input_data,
                                   const T* input_rois,
                                   const float spatial_scale,
...@@ -137,7 +137,7 @@ __global__ void GPURoiAlignForward(const int nthreads,
}

template <typename T, typename Context>
void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
...@@ -233,7 +233,7 @@ void RoiAlignKernel(const Context& dev_ctx,
  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
  paddle::memory::Copy(
      gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());

  GPURoiAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
      output_size,
      x.data<T>(),
      boxes.data<T>(),
...@@ -252,4 +252,4 @@ void RoiAlignKernel(const Context& dev_ctx,
} // namespace phi

PD_REGISTER_KERNEL(
    roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad) {
auto* in_data = out_grad.data<T>();
T* out_data = dev_ctx.template Alloc<T>(x_grad);
int64_t numel = out_grad.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = out_grad.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = ((-shifts_data[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll_grad,
GPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
auto* in_data = x.data<T>();
T* out_data = dev_ctx.template Alloc<T>(out);
int64_t numel = x.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = x.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = (shifts_data[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = (shifts_data[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
GPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input,
T* output,
int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts_data[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, \
0, \
stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \
break; \
}
} // namespace phi
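For orientation, the scatter formula RollCudaKernel applies to each flat index can be read as a plain host loop. The sketch below is illustrative only (the function name and the float element type are assumptions, not part of this commit); it mirrors the arithmetic above, and RollGradKernel reuses the same formula with every shift negated before normalization, so the backward pass is simply a roll in the opposite direction.

#include <cstdint>
#include <vector>

// Illustrative host-side roll: scatter input[idx] to the position it occupies
// after rolling each listed dimension by its normalized, non-negative shift.
void RollOnHost(const float* input,
                float* output,
                int64_t numel,
                const std::vector<int64_t>& shifts,   // each in [0, sizes[i])
                const std::vector<int64_t>& strides,  // stride of each rolled dim
                const std::vector<int64_t>& sizes) {  // extent of each rolled dim
  for (int64_t idx = 0; idx < numel; ++idx) {
    int64_t output_idx = idx;
    for (size_t i = 0; i < shifts.size(); ++i) {
      // Coordinate of idx along dimension i, then shifted with wrap-around.
      int64_t new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
      output_idx +=
          (new_dim_idx >= sizes[i] ? shifts[i] - sizes[i] : shifts[i]) *
          strides[i];
    }
    output[output_idx] = input[idx];
  }
}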
...@@ -12,32 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"

#include "paddle/phi/core/kernel_registry.h"

PD_REGISTER_KERNEL(tril_triu_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::TrilTriuGradKernel,
                   bool,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
...@@ -12,25 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"

#include "paddle/phi/core/kernel_registry.h"

PD_REGISTER_KERNEL(tril_triu,
                   GPU,
                   ALL_LAYOUT,
                   phi::TrilTriuKernel,
                   bool,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void GridSampleGradKernel(const Context &dev_ctx,
const DenseTensor &x,
const DenseTensor &grid,
const DenseTensor &out_grid,
const std::string &mode,
const std::string &padding_mode,
bool align_corners,
DenseTensor *x_grad,
DenseTensor *grid_grad);
} // namespace phi
...@@ -14,22 +14,19 @@
#pragma once

#include <string>
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void GridSampleKernel(const Context &dev_ctx,
                      const DenseTensor &x,
                      const DenseTensor &grid,
                      const std::string &mode,
                      const std::string &padding_mode,
                      bool align_corners,
                      DenseTensor *out);

} // namespace phi
...@@ -202,4 +202,24 @@ void TanhTripleGradKernel(const Context& dev_ctx,
                  d_ddx);  // output
}
template <typename T, typename Context>
void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
float alpha,
DenseTensor* dx,
DenseTensor* ddout) {
if (dx) {
dx->Resize(x.dims());
dev_ctx.template Alloc<T>(dx);
}
if (ddout) {
dev_ctx.template Alloc<T>(ddout);
}
funcs::ELUGradGradFunctor<T> functor;
functor.alpha = alpha;
functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
}
} // namespace phi
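For reference, the quantities the added EluDoubleGradKernel asks ELUGradGradFunctor to produce follow from the usual ELU definition; the identity below is a standard elementwise double-backward derivation stated here for context, not text taken from the patch.

\[
\mathrm{ELU}(x)=\begin{cases}x, & x>0\\ \alpha\,(e^{x}-1), & x\le 0\end{cases}
\qquad
\mathrm{ELU}'(x)=\begin{cases}1, & x>0\\ \alpha e^{x}, & x\le 0\end{cases}
\qquad
\mathrm{ELU}''(x)=\begin{cases}0, & x>0\\ \alpha e^{x}, & x\le 0\end{cases}
\]

so, elementwise, ddout = ddx * ELU'(x) and the extra gradient written to dx is dout * ddx * ELU''(x), which vanishes on the x > 0 branch.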
...@@ -24,13 +24,12 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/matrix_reduce.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {

template <typename T, typename Context>
...@@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx,
  const auto H = y_bst_dims_vec[y_bst_ndim - 2];
  const auto W = y_bst_dims_vec[y_bst_ndim - 1];
  phi::funcs::ForRange<Context> y_for_range(dev_ctx, dy_bst.numel());
  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
      dy_bst.data<T>(), 0, !upper, H, W, dy_bst_upper.data<T>());
  y_for_range(tril_triu_functor);
......
...@@ -13,6 +13,7 @@
// limitations under the License.

#pragma once

#include <unsupported/Eigen/SpecialFunctions>
#include "paddle/phi/kernels/funcs/for_range.h"

namespace phi {
template <typename T>
......
...@@ -21,12 +21,11 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/matrix_reduce.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"

namespace phi {

template <typename T, typename Context>
...@@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx,
  const auto H = dims[dims.size() - 2];
  const auto W = dims[dims.size() - 1];
  phi::funcs::ForRange<Context> x_for_range(dev_ctx, dx_bst.numel());
  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
      dx_bst.data<T>(), unitriangular, !upper, H, W, dx_bst_upper.data<T>());
  x_for_range(tril_triu_functor);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int diagonal,
bool lower,
DenseTensor* x_grad) {
const auto* dout_data = out_grad.data<T>();
auto* dx_data = ctx.template Alloc<T>(x_grad);
const auto& dims = out_grad.dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
phi::funcs::ForRange<Context> for_range(
ctx, static_cast<size_t>(out_grad.numel()));
phi::funcs::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/tril_triu_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuKernel(const Context& ctx,
const DenseTensor& x,
int diagonal,
bool lower,
DenseTensor* out) {
const auto* x_data = x.data<T>();
auto* out_data = ctx.template Alloc<T>(out);
const auto& dims = x.dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
phi::funcs::ForRange<Context> for_range(ctx, static_cast<size_t>(x.numel()));
phi::funcs::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
} // namespace phi
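For orientation, the element-wise rule that TrilTriuCompute applies over each trailing H x W matrix can be written as a plain host loop. This sketch assumes the usual numpy-style tril/triu semantics and is not the functor's actual source; the function name and float element type are illustrative.

#include <cstdint>

// Illustrative host-side tril/triu: keep the lower (or upper) band of every
// trailing H x W matrix, shifted by `diagonal`, and zero the rest.
void TrilTriuOnHost(const float* in,
                    float* out,
                    int64_t numel,
                    int diagonal,
                    bool lower,
                    int64_t H,
                    int64_t W) {
  for (int64_t index = 0; index < numel; ++index) {
    const int64_t row = (index / W) % H;  // row inside the trailing H x W matrix
    const int64_t col = index % W;        // column inside that matrix
    const bool keep = lower ? (col <= row + diagonal)   // tril keeps the lower band
                            : (col >= row + diagonal);  // triu keeps the upper band
    out[index] = keep ? in[index] : 0.0f;
  }
}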
...@@ -19,17 +19,11 @@
namespace phi {

template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
                           const DenseTensor& x,
                           const DenseTensor& index,
                           const DenseTensor& out_grad,
                           int dim,
                           DenseTensor* x_grad);

} // namespace phi
...@@ -19,17 +19,10 @@
namespace phi {

template <typename T, typename Context>
void IndexSelectKernel(const Context& ctx,
                       const DenseTensor& x,
                       const DenseTensor& index,
                       int dim,
                       DenseTensor* output);

} // namespace phi
...@@ -19,27 +19,6 @@
namespace phi {

template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const DenseTensor& x,
...@@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx,
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;

PD_REGISTER_KERNEL(add,
                   CPU,
                   ALL_LAYOUT,
...@@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply,
                   phi::dtype::bfloat16) {}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(add,
                   GPU,
                   ALL_LAYOUT,
......
...@@ -16,43 +16,8 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/binary.h"

namespace phi {

template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
                  const DenseTensor& x,
...@@ -149,29 +114,4 @@ DenseTensor Multiply(const Context& dev_ctx,
  return dense_out;
}

} // namespace phi
...@@ -19,17 +19,9 @@
namespace phi {

template <typename T, typename Context>
void MultiplexGradKernel(const Context& ctx,
                         const DenseTensor& ids,
                         const DenseTensor& out_grad,
                         std::vector<DenseTensor*> ins_grad);

} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void MultiplexKernel(const Context& ctx,
const std::vector<const DenseTensor*>& ins,
const DenseTensor& ids,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void QrKernel(const Context& ctx,
const DenseTensor& x,
const std::string& mode,
DenseTensor* q,
DenseTensor* r);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
DataType out_dtype,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
}
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void ProdKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
ProdRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void AnyKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(
mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {}
PD_REGISTER_KERNEL(sum,
CPU,
ALL_LAYOUT,
phi::SumKernel,
bool,
float,
double,
phi::dtype::float16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(mean,
GPU,
ALL_LAYOUT,
phi::MeanKernel,
float,
double,
bool,
int,
int64_t,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(sum,
GPU,
ALL_LAYOUT,
phi::SumKernel,
bool,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
int16_t,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PD_REGISTER_KERNEL(
prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void SumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out);
template <typename T, typename Context>
void MeanRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void ProdRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void MaxRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void MinRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void AnyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void AllRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
DataType out_dtype,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void ProdKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void AnyKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
void AllKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Mean(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Sum(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
DataType dtype,
bool keep_dim) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
SumInferMeta(x, axis, dtype, keep_dim, &meta_out);
SumKernel<T, Context>(dev_ctx, x, axis, dtype, keep_dim, &dense_out);
return dense_out;
}
} // namespace phi
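As a usage note, the Mean and Sum helpers declared above let other phi kernels reduce a DenseTensor in a single call, with the output shape coming from the corresponding InferMeta. A minimal, hypothetical caller might look like the sketch below (the surrounding function is an illustration, not part of this commit).

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/reduce_kernel.h"

namespace phi {

// Hypothetical helper: reduce x twice using the convenience wrappers above.
template <typename T, typename Context>
void ReduceTwiceExample(const Context& dev_ctx, const DenseTensor& x) {
  // Mean over axis 0, keeping the reduced dimension.
  DenseTensor m = Mean<T, Context>(dev_ctx, x, {0}, /*keep_dim=*/true);
  // Sum over axis 0, accumulating in the input dtype.
  DenseTensor s =
      Sum<T, Context>(dev_ctx, x, {0}, x.dtype(), /*keep_dim=*/false);
  (void)m;
  (void)s;
}

} // namespace phi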
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_max_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_min_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MinKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
} // namespace phi
PD_REGISTER_KERNEL(
min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"
namespace phi {
template <typename T, typename Context>
void RoiAlignGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
DenseTensor* dx);
} // namespace phi
...@@ -20,7 +20,7 @@
namespace phi {

template <typename T, typename Context>
void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out);
} // namespace phi
...@@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      const DenseTensor& rulebook,
                      const DenseTensor& kernel,
                      const DenseTensor& out_grad,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
...@@ -41,7 +41,7 @@ std::vector<DenseTensor> Conv3dGrad(const Context& dev_ctx,
                                    const SparseCooTensor& x,
                                    const DenseTensor& rulebook,
                                    const DenseTensor& kernel,
                                    const DenseTensor& out_grad,
                                    const std::vector<int>& paddings,
                                    const std::vector<int>& dilations,
                                    const std::vector<int>& strides,
......
...@@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
template <typename T, typename Context>
void ProductRuleBook(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const std::vector<int>& kernel_sizes,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
...@@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx,
                     const bool subm,
                     DenseTensor* rulebook,
                     DenseTensor* counter_per_kernel) {
  const int64_t non_zero_num = x.nnz();
  const auto& non_zero_indices = x.non_zero_indices();
  const int* indices_ptr = non_zero_indices.data<int>();
  int* counter_ptr = counter_per_kernel->data<int>();
  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
  memset(counter_ptr, 0, kernel_size * sizeof(int));
  int rulebook_len = 0;
  // calc the rulebook_len
  const auto& x_dims = x.dims();
  const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
  const Dims4D c_kernel_dims(
      1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
  const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
  const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]);
  const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
...@@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx,
  auto f_calc_rulebook = [&](int* rulebook_ptr) {
    int kernel_index = 0, rulebook_index = 0;
    for (int kz = 0; kz < kernel_sizes[0]; kz++) {
      for (int ky = 0; ky < kernel_sizes[1]; ky++) {
        for (int kx = 0; kx < kernel_sizes[2]; kx++) {
          ++kernel_index;
          for (int64_t i = 0; i < non_zero_num; i++) {
            int batch = indices_ptr[i];
......
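The rulebook built above is consumed with a gather / per-offset GEMM / scatter-add pattern: row 0 holds the kernel-offset id, row 1 the input feature index, and row 2 the output feature index (see the doc comment on ProductRuleBookKernel further below). The host-side sketch that follows is illustrative only; the names, the float element type, and the naive inner loops (the real kernels batch them as BLAS GEMMs) are assumptions, not the patch's code.

// Illustrative dense walk over a [3 x rulebook_len] rulebook.
void SparseConvViaRulebook(const float* in_features,  // [nnz_in, in_channels]
                           const float* kernel,       // [kernel_size, in_channels, out_channels]
                           const int* rulebook,
                           int rulebook_len,
                           int kernel_size,
                           int in_channels,
                           int out_channels,
                           float* out_features) {     // [nnz_out, out_channels], pre-zeroed
  for (int k = 0; k < kernel_size; ++k) {
    for (int r = 0; r < rulebook_len; ++r) {
      if (rulebook[r] != k) continue;                    // row 0: kernel offset id
      const int in_i = rulebook[rulebook_len + r];       // row 1: input index
      const int out_i = rulebook[2 * rulebook_len + r];  // row 2: output index
      // One (1 x in_channels) * (in_channels x out_channels) product,
      // accumulated (scatter-add) into the output row.
      for (int oc = 0; oc < out_channels; ++oc) {
        float acc = 0.0f;
        for (int ic = 0; ic < in_channels; ++ic) {
          acc += in_features[in_i * in_channels + ic] *
                 kernel[(k * in_channels + ic) * out_channels + oc];
        }
        out_features[out_i * out_channels + oc] += acc;
      }
    }
  }
}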
...@@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      const DenseTensor& rulebook,
                      const DenseTensor& kernel,
                      const DenseTensor& out_grad,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
...@@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
            rulebook_len,
            in_channels,
            in_features_ptr);
  Gather<T>(out_grad.data<T>(),
            rulebook_ptr + rulebook_len * 2,
            rulebook_len,
            out_channels,
......
...@@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx,
  const auto& kernel_dims = kernel.dims();
  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = kernel_dims[i];
  }
  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
  const int in_channels = kernel_dims[3];
  const int out_channels = kernel_dims[4];
...@@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx,
  ProductRuleBook<T, Context>(dev_ctx,
                              x,
                              kernel_sizes,
                              subm_paddings,
                              dilations,
                              subm_strides,
......
...@@ -23,11 +23,15 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"

namespace phi {
namespace sparse {

using Dims4D = phi::funcs::sparse::Dims4D;

// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
// this kernel with phi::GatherCUDAKernel;
// Vectorization can be used to improve read and write bandwidth
...@@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx,
  return new_end.first;
}
template <typename T>
__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
const int n,
const int rulebook_len,
const int kernel_size,
T* rulebook_ptr,
int* counter_ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int cache_count[]; // kernel_size
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
cache_count[i] = 0;
}
__syncthreads();
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
int index = indexs[i];
int kernel_index = rulebook_ptr[index];
rulebook_ptr[index + rulebook_len] = -1;
rulebook_ptr[index + 2 * rulebook_len] = -1;
rulebook_ptr[index] = -1;
atomicAdd(&cache_count[kernel_index], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicSub(&counter_ptr[i], cache_count[i]);
}
}
/**
* @brief: update the out index and indices
* unique_keys: save the index of the output feature list
* unique_values: indicates the index of the key before deduplication
* out_indexs: indicates the position of the output index in the rulebook
* rulebook_len: indicates the length of rulebook
* out_dims: indicates the output dims
* out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
* rulebook_out_indexs: the output index in rulebook
**/
template <typename T>
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
T* out_indices,
T* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
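// Hedged, standalone host-side sketch (not part of the original file): the
// decode that UpdateIndexKernel performs above, i.e. recovering
// (batch, z, y, x) from a flattened output index. A row-major
// (batch, z, y, x) layout is assumed here as a stand-in for the actual
// IndexToPoint / Dims4D convention, which is defined elsewhere.
inline void IndexToPointSketch(int index,
                               const int out[3],  // {depth, height, width}
                               int* batch,
                               int* x,
                               int* y,
                               int* z) {
  *x = index % out[2];
  index /= out[2];
  *y = index % out[1];
  index /= out[1];
  *z = index % out[0];
  index /= out[0];
  *batch = index;
}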
// brief: calculate the distance between start and end
template <typename T>
__global__ void DistanceKernel(const T* start, const T* end, int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participates in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: save the number of times each location in the kernel participates in
*the calculation
**/
template <typename T>
__global__ void ProductRuleBookKernel(const T* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const bool subm,
T* rulebook,
int* counter,
int* in_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
if (subm) {
in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
}
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + offset + i] = in_i;
rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
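// Hedged, standalone host-side sketch (not part of the original file): for a
// single input point, enumerate the kernel offsets and emit
// (kernel_index, in_index, out_index) triples, mirroring the triple loop in
// ProductRuleBookKernel above. The bounds / stride-alignment test below is an
// assumption standing in for phi::funcs::sparse::Check, and the flattening is
// an assumed row-major (batch, z, y, x) layout standing in for PointToIndex.
struct RuleBookEntrySketch {
  int kernel_i;
  int in_i;
  int out_i;
};

inline bool AlignedSketch(
    int in, int pad, int k, int dilation, int stride, int out_extent) {
  int numerator = in + pad - k * dilation;
  if (numerator < 0 || numerator % stride != 0) return false;
  return numerator / stride < out_extent;
}

// Writes at most kd*kh*kw entries into `entries` and returns how many were set.
inline int RuleBookForPointSketch(int i,
                                  int batch,
                                  int in_x,
                                  int in_y,
                                  int in_z,
                                  const int kernel[3],     // {kd, kh, kw}
                                  const int paddings[3],   // {pd, ph, pw}
                                  const int dilations[3],
                                  const int strides[3],
                                  const int out[3],        // {od, oh, ow}
                                  RuleBookEntrySketch* entries) {
  int count = 0;
  int kernel_index = 0;
  for (int kz = 0; kz < kernel[0]; kz++) {
    for (int ky = 0; ky < kernel[1]; ky++) {
      for (int kx = 0; kx < kernel[2]; kx++, kernel_index++) {
        if (!AlignedSketch(in_z, paddings[0], kz, dilations[0], strides[0], out[0]) ||
            !AlignedSketch(in_y, paddings[1], ky, dilations[1], strides[1], out[1]) ||
            !AlignedSketch(in_x, paddings[2], kx, dilations[2], strides[2], out[2])) {
          continue;  // the CUDA kernel writes -1 here and removes it later
        }
        int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
        int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
        int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
        int out_index = ((batch * out[0] + out_z) * out[1] + out_y) * out[2] + out_x;
        entries[count].kernel_i = kernel_index;
        entries[count].in_i = i;
        entries[count].out_i = out_index;
        count++;
      }
    }
  }
  return count;
}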
// for the basic algorithm, refer to convolution_kernel.cc or
// the second paper
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value) pairs
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
DenseTensor in_indexs = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int rulebook_rows = 3;
const int rulebook_cols = kernel_size * non_zero_num;
rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<int><<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
subm,
rulebook_ptr,
counter_ptr,
in_indexs.data<int>());
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols,
-1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
int rulebook_len = 0;
phi::backends::gpu::GpuMemcpyAsync(
&rulebook_len,
rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
rulebook_len /= 3;
dev_ctx.Wait();
if (subm) {
// At present, a hashtable is not used to map the input and output indexes.
// Instead, the intermediate output indexes are generated by a normal
// convolution, and then the set difference between the intermediate output
// indexes and the input indexes is taken to obtain the rulebook entries
// that must be removed.
// get difference
int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
int32_t* B_key_ptr = in_indexs.data<int>();
DenseTensor A_val = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
DenseTensor B_val = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &A_val, kps::IdentityFunctor<int>());
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &B_val, kps::IdentityFunctor<int>());
DenseTensor key_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
DenseTensor val_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
std::vector<int> offsets(kernel_size, 0);
// TODO(zhangkaihuo): use a unified memcpy interface
phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
offsets_ptr,
kernel_size * sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
thrust::pair<int*, int*> end;
// Because set_diff does not support duplicate data, set_diff is performed
// separately for each segment of data.
// TODO(zhangkaihuo): Using a hashtable here may give better performance;
// further tests are needed.
for (int i = 0; i < kernel_size; i++) {
int start = offsets[i];
int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
end =
#ifdef PADDLE_WITH_HIP
thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
A_key_ptr + start,
A_key_ptr + stop,
B_key_ptr,
B_key_ptr + x.nnz(),
A_val.data<int>() + start,
B_val.data<int>(),
key_result_start,
val_result_start);
}
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
key_result.data<int>(),
end.first,
key_result.data<int>() + rulebook_len);
int len = 0;
phi::backends::gpu::GpuMemcpyAsync(&len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
// set the diff value = -1, and update counter
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
SetFlagAndUpdateCounterKernel<int><<<config.block_per_grid.x,
config.thread_per_block,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(
val_result.data<int>(),
len,
rulebook_len,
kernel_size,
rulebook_ptr,
counter_ptr);
// remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 3 * rulebook_len,
-1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, key_result.data<int>() + rulebook_len);
phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
rulebook_len /= 3;
}
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
rulebook->Resize({rulebook_rows, rulebook_len});
// 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
int* out_index_ptr = out_index->data<int>();
int* unique_value_ptr = unique_value->data<int>();
int* unique_key_ptr = unique_key->data<int>();
int* new_end = SortedAndUniqueIndex(dev_ctx,
rulebook_ptr + 2 * rulebook_len,
rulebook_len,
out_index,
unique_key,
unique_value);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
DistanceKernel<int><<<1, 1>>>(
unique_key_ptr,
new_end,
rulebook_ptr + rulebook_rows * rulebook_cols - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + 2 * rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
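// Hedged, standalone host-side sketch (not part of the original file) of steps
// 3-5 in the numbered comment above ProductRuleBook: collect the distinct out
// indexes in ascending order and rewrite every rulebook entry with its compact
// id. A simple O(n^2) scan replaces the thrust sort/unique + UpdateIndexKernel
// path used on the GPU; it is only meant to make the example in that comment
// concrete. `unique_keys` must have room for `len` entries.
//
// Example from the comment: out_index = {20, 30, 33, 30, 33, 20, 25}
//   unique_keys becomes {20, 25, 30, 33}
//   out_index becomes   {0, 2, 3, 2, 3, 0, 1}
inline int CompactOutIndexSketch(int* out_index, int len, int* unique_keys) {
  int num_unique = 0;
  // 1. collect distinct keys (insertion keeps them sorted ascending)
  for (int i = 0; i < len; i++) {
    int pos = 0;
    while (pos < num_unique && unique_keys[pos] < out_index[i]) pos++;
    if (pos < num_unique && unique_keys[pos] == out_index[i]) continue;
    for (int j = num_unique; j > pos; j--) unique_keys[j] = unique_keys[j - 1];
    unique_keys[pos] = out_index[i];
    num_unique++;
  }
  // 2. rewrite each entry with the position of its key in unique_keys
  for (int i = 0; i < len; i++) {
    for (int pos = 0; pos < num_unique; pos++) {
      if (unique_keys[pos] == out_index[i]) {
        out_index[i] = pos;
        break;
      }
    }
  }
  return num_unique;  // == number of output non-zeros
}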
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
...@@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const DenseTensor& kernel, const DenseTensor& kernel,
const SparseCooTensor& out_grad, const DenseTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
...@@ -140,8 +140,7 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -140,8 +140,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
GatherKernel<T, int><<<config.block_per_grid.x, GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>( dev_ctx.stream()>>>(out_grad.data<T>(),
out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2, rulebook_ptr + rulebook_len * 2,
out_grad_features_ptr, out_grad_features_ptr,
rulebook_len, rulebook_len,
......
...@@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
using Dims4D = phi::funcs::sparse::Dims4D;
__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
const int n,
const int rulebook_len,
const int kernel_size,
int* rulebook_ptr,
int* counter_ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int cache_count[]; // kernel_size
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
cache_count[i] = 0;
}
__syncthreads();
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
int index = indexs[i];
int kernel_index = rulebook_ptr[index];
rulebook_ptr[index + rulebook_len] = -1;
rulebook_ptr[index + 2 * rulebook_len] = -1;
rulebook_ptr[index] = -1;
atomicAdd(&cache_count[kernel_index], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicSub(&counter_ptr[i], cache_count[i]);
}
}
/**
* @brief: update the out index and indices
* unique_keys: save the index of the output feature list
* unique_values: indicates the index of the key before deduplication
* out_indexs: indicates the position of the output index in the rulebook
* rulebook_len: indicates the length of rulebook
* out_dims: indicates the output dims
* out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
* rulebook_out_indexs: the output index in rulebook
**/
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
int* out_indices,
int* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participates in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: save the number of times each location in the kernel participates in
*the calculation
**/
__global__ void ProductRuleBookKernel(const int* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const bool subm,
int* rulebook,
int* counter,
int* in_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
if (subm) {
in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
}
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int in_i = -1, out_index = -1, kernel_i = -1;
if (phi::funcs::sparse::Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + offset + i] = in_i;
rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
// brief: calculate the distance between start and end
__global__ void DistanceKernel(const int* start,
const int* end,
int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
// for the basic algorithm, refer to convolution_kernel.cc or
// the second paper
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value) pairs
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
const bool subm,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const auto& kernel_dims = kernel.dims();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
DenseTensor in_indexs = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int rulebook_rows = 3;
const int rulebook_cols = kernel_size * non_zero_num;
rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
subm,
rulebook_ptr,
counter_ptr,
in_indexs.data<int>());
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols,
-1);
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
int rulebook_len = 0;
phi::backends::gpu::GpuMemcpyAsync(
&rulebook_len,
rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
rulebook_len /= 3;
dev_ctx.Wait();
if (subm) {
// At present, a hashtable is not used to map the input and output indexes.
// Instead, the intermediate output indexes are generated by a normal
// convolution, and then the set difference between the intermediate output
// indexes and the input indexes is taken to obtain the rulebook entries
// that must be removed.
// get difference
int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
int32_t* B_key_ptr = in_indexs.data<int>();
DenseTensor A_val = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
DenseTensor B_val = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &A_val, kps::IdentityFunctor<int>());
phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, &B_val, kps::IdentityFunctor<int>());
DenseTensor key_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
DenseTensor val_result = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
std::vector<int> offsets(kernel_size, 0);
// TODO(zhangkaihuo): use a unified memcpy interface
phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
offsets_ptr,
kernel_size * sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
thrust::pair<int*, int*> end;
// Because set_diff does not support duplicate data, set_diff is performed
// separately for each segment of data.
// TODO(zhangkaihuo): Using a hashtable here may give better performance;
// further tests are needed.
for (int i = 0; i < kernel_size; i++) {
int start = offsets[i];
int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
end =
#ifdef PADDLE_WITH_HIP
thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
A_key_ptr + start,
A_key_ptr + stop,
B_key_ptr,
B_key_ptr + x.nnz(),
A_val.data<int>() + start,
B_val.data<int>(),
key_result_start,
val_result_start);
}
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
key_result.data<int>(),
end.first,
key_result.data<int>() + rulebook_len);
int len = 0;
phi::backends::gpu::GpuMemcpyAsync(&len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
// set the diff value = -1, and update counter
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
SetFlagAndUpdateCounterKernel<<<config.block_per_grid.x,
config.thread_per_block,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(val_result.data<int>(),
len,
rulebook_len,
kernel_size,
rulebook_ptr,
counter_ptr);
// remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 3 * rulebook_len,
-1);
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, key_result.data<int>() + rulebook_len);
phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
key_result.data<int>() + rulebook_len,
sizeof(int),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
cudaMemcpyDeviceToHost,
#endif
dev_ctx.stream());
dev_ctx.Wait();
rulebook_len /= 3;
}
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
rulebook->Resize({rulebook_rows, rulebook_len});
// 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
int* out_index_ptr = out_index->data<int>();
int* unique_value_ptr = unique_value->data<int>();
int* unique_key_ptr = unique_key->data<int>();
int* new_end = SortedAndUniqueIndex(dev_ctx,
rulebook_ptr + 2 * rulebook_len,
rulebook_len,
out_index,
unique_key,
unique_value);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
DistanceKernel<<<1, 1>>>(unique_key_ptr,
new_end,
rulebook_ptr + rulebook_rows * rulebook_cols - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + 2 * rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
/** /**
* x: (N, D, H, W, C) * x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC) * kernel: (D, H, W, C, OC)
...@@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx,
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
DDim out_dims = {1, 1, 1, 1, 1}; DDim out_dims = {1, 1, 1, 1, 1};
std::vector<int> kernel_sizes(kernel_dims.size());
for (int i = 0; i < kernel_dims.size(); i++) {
kernel_sizes[i] = kernel_dims[i];
}
phi::funcs::sparse::GetOutShape( phi::funcs::sparse::GetOutShape(
x_dims, kernel_dims, paddings, dilations, strides, &out_dims); x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
out->set_dims(out_dims);
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size); std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
...@@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx,
int n = ProductRuleBook<T, Context>(dev_ctx, int n = ProductRuleBook<T, Context>(dev_ctx,
x, x,
kernel, kernel_sizes,
subm_paddings, subm_paddings,
dilations, dilations,
subm_strides, subm_strides,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int diagonal,
bool lower,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuKernel(const Context& ctx,
const DenseTensor& x,
int diagonal,
bool lower,
DenseTensor* out);
} // namespace phi
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
namespace phi { namespace phi {
#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ #define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \ KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \ const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \ return KernelSignature(op_name "_grad", \
...@@ -25,7 +25,7 @@ namespace phi { ...@@ -25,7 +25,7 @@ namespace phi {
{GradVarName("X")}); \ {GradVarName("X")}); \
} }
#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ #define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \ KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \ const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \ return KernelSignature(op_name "_grad", \
...@@ -36,25 +36,29 @@ namespace phi { ...@@ -36,25 +36,29 @@ namespace phi {
#define comma , #define comma ,
DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT
DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT
DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT
DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT
DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT
DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT
DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT
DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT
DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT
DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT
DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT
DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
DefineActGradDepXOpArgMap(ThresholdedRelu, DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
"thresholded_relu", "thresholded_relu",
"threshold"); // NOLINT "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT
DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
KernelSignature ReluDoubleGradOpArgumentMapping( KernelSignature ReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
...@@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
} }
KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
}
KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("elu_grad",
{"X", "Out", GradVarName("Out")},
{"alpha"},
{GradVarName("X")});
}
KernelSignature EluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
}
} // namespace phi } // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
...@@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, ...@@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
phi::LeakyReluDoubleGradOpArgumentMapping); phi::LeakyReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
phi::ThresholdedReluGradOpArgumentMapping); phi::ThresholdedReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(softshrink_grad,
phi::SoftShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad,
phi::HardShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad,
phi::TanhShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,24 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,24 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/compat/op_utils.h"
namespace ops = paddle::operators; namespace phi {
namespace plat = paddle::platform;
KernelSignature GridSamplerOpArgumentMapping(
REGISTER_OP_CUDA_KERNEL( const ArgumentMappingContext& ctx) {
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, bool>, return KernelSignature("grid_sample",
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>, {"X", "Grid"},
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>, {"mode", "padding_mode", "align_corners"},
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>, {"Output"});
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int64_t>, }
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( KernelSignature GridSamplerGradOpArgumentMapping(
tril_triu_grad, const ArgumentMappingContext& ctx) {
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, bool>, return KernelSignature("grid_sample_grad",
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>, {"X", "Grid", GradVarName("Output")},
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>, {"mode", "padding_mode", "align_corners"},
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>, {GradVarName("X"), GradVarName("Grid")});
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>, }
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>); } // namespace phi
// use Python API name as kernel name
PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample);
PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad);
PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad,
phi::GridSamplerGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature IndexSelectGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("index_select_grad",
{"X", "Index", GradVarName("Out")},
{"dim"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(index_select_grad,
phi::IndexSelectGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"});
}
KernelSignature MultiplexGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping);
...@@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
} }
KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) {
bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
// When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
// InferShape, so we must return the "max_raw" KernelSignature.
// And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
// the "max_raw" KernelSignature
if (ctx.IsForInferShape() || reduce_all) {
return KernelSignature( return KernelSignature(
"reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
}
return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"});
}
return KernelSignature("unregistered", {}, {}, {});
} }
KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
...@@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) {
KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) { if (ctx.IsDenseTensorInput("X")) {
bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all")); bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
// When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
// InferShape, so we must return the "all_raw" KernelSignature.
// And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
// the "all_raw" KernelSignature
if (ctx.IsForInferShape() || reduce_all) { if (ctx.IsForInferShape() || reduce_all) {
return KernelSignature( return KernelSignature(
"all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
...@@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); ...@@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean);
PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max);
PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min);
PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod);
PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all);
PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any);
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace phi { namespace phi {
KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roi_align", return KernelSignature("roi_align",
{"X", "ROIs", "RoisNum"}, {"X", "ROIs", "RoisNum"},
{"pooled_height", {"pooled_height",
...@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
{"Out"}); {"Out"});
} }
KernelSignature RoiAlignGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("roi_align_grad",
{"X", "ROIs", "RoisNum", GradVarName("Out")},
{"pooled_height",
"pooled_width",
"spatial_scale",
"sampling_ratio",
"aligned"},
{GradVarName("X")});
}
} // namespace phi } // namespace phi
PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping);
...@@ -12,26 +12,25 @@ ...@@ -12,26 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/reduce_any_kernel.h" #include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
template <typename T, typename Context> KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) {
void AnyKernel(const Context& dev_ctx, if (ctx.HasInput("ShiftsTensor")) {
const DenseTensor& x, return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"});
const std::vector<int64_t>& dims, }
bool keep_dim, return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"});
DenseTensor* out) { }
bool reduce_all = false;
AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out); KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roll_grad",
{"X", GradVarName("Out")},
{"shifts", "axis"},
{GradVarName("X")});
} }
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping);
PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
#endif
...@@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { ...@@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.HasInput("RepeatTimes")) { if (ctx.HasInput("RepeatTimes")) {
return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"});
} else if (ctx.InputSize("repeat_times_tensor") > 0) { } else if (ctx.InputSize("repeat_times_tensor") > 0) {
const auto& repeat_times =
paddle::any_cast<std::vector<int>>(ctx.Attr("repeat_times"));
if (!ctx.IsRuntime() && !repeat_times.empty()) {
return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
}
return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"});
} else { } else {
return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"});
}
KernelSignature TrilTriuGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("tril_triu_grad",
{GradVarName("Out")},
{"diagonal", "lower"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping);
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
......
...@@ -132,11 +132,12 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -132,11 +132,12 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(out.non_zero_elements().data<T>(), correct_out_features); f_verify(out.non_zero_elements().data<T>(), correct_out_features);
if (backward) { if (backward) {
std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_cpu, std::vector<DenseTensor> grads =
sparse::Conv3dGrad<T>(dev_ctx_cpu,
x_tensor, x_tensor,
rulebook, rulebook,
kernel_tensor, kernel_tensor,
out, out.non_zero_elements(),
paddings, paddings,
dilations, dilations,
strides, strides,
...@@ -231,11 +232,12 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -231,11 +232,12 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features); f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) { if (backward) {
std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_gpu, std::vector<DenseTensor> grads =
sparse::Conv3dGrad<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
d_rulebook, d_rulebook,
d_kernel_tensor, d_kernel_tensor,
d_out, d_out.non_zero_elements(),
paddings, paddings,
dilations, dilations,
strides, strides,
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
......
...@@ -21,11 +21,12 @@ from paddle.fluid import framework ...@@ -21,11 +21,12 @@ from paddle.fluid import framework
from .utils import print_program_with_dist_attr from .utils import print_program_with_dist_attr
from .operators import find_best_compatible_distributed_operator_impl from .operators import find_best_compatible_distributed_operator_impl
from .dist_context import get_default_distributed_context from .dist_context import get_default_distributed_context, _node_id
from .dist_tensor import DistributedTensor from .dist_tensor import DistributedTensor
from .dist_op import DistributedOperator from .dist_op import DistributedOperator
from .dist_attribute import TensorDistributedAttribute from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute
from .process_mesh import ProcessMesh
from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.distributed.fleet.meta_optimizers.common import OpRole
...@@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): ...@@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list):
return compatible_result return compatible_result
def merge_process_mesh_two(pm1, pm2):
process_set1 = set()
process_set2 = set()
if pm1 is None and pm2 is None:
return None
if pm1 is not None:
process_set1 = set(pm1.processes)
if pm2 is not None:
process_set2 = set(pm2.processes)
merged_process_set = process_set1.union(process_set2)
merged_process_mesh = ProcessMesh(list(merged_process_set))
return merged_process_mesh
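The helper above treats a process mesh purely as its flat set of process ids and returns a mesh over their union. A minimal standalone sketch of that behaviour, using a stand-in ProcessMesh that only records a process list (the real ProcessMesh also carries a topology), is:

# Stand-in ProcessMesh for this sketch only: it records a flat process list,
# whereas the real ProcessMesh also carries a mesh topology.
class _SketchProcessMesh:
    def __init__(self, processes):
        self.processes = list(processes)

def merge_process_mesh_two_sketch(pm1, pm2):
    # Union of the two process sets; None means "no mesh yet".
    if pm1 is None and pm2 is None:
        return None
    s1 = set(pm1.processes) if pm1 is not None else set()
    s2 = set(pm2.processes) if pm2 is not None else set()
    return _SketchProcessMesh(sorted(s1 | s2))

merged = merge_process_mesh_two_sketch(_SketchProcessMesh([0, 1]),
                                        _SketchProcessMesh([1, 2, 3]))
assert merged.processes == [0, 1, 2, 3]

The sketch sorts the merged list for determinism; the patched helper keeps whatever ordering list(set(...)) yields.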
class Completer: class Completer:
def __init__(self, dist_context): def __init__(self, dist_context):
assert dist_context is not None assert dist_context is not None
...@@ -119,7 +134,9 @@ class Completer: ...@@ -119,7 +134,9 @@ class Completer:
return False return False
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
# Skip reader tensor # Skip reader tensor
if tensor_desc.type() == core.VarDesc.VarType.READER: if tensor_desc.type() == core.VarDesc.VarType.READER \
or tensor_desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or tensor_desc.type() == core.VarDesc.VarType.STEP_SCOPES:
return False return False
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node) tensor_node)
...@@ -185,7 +202,7 @@ class Completer: ...@@ -185,7 +202,7 @@ class Completer:
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
if fwd: if fwd:
for tensor_node in op_node.inputs: for tensor_node in op_node.inputs:
if tensor_node.var() is not None: if tensor_node.is_var() and tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER: if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue continue
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
...@@ -208,7 +225,7 @@ class Completer: ...@@ -208,7 +225,7 @@ class Completer:
# Find the most compatible implementations from the distributed operator # Find the most compatible implementations from the distributed operator
op_dist_impl = find_best_compatible_distributed_operator_impl( op_dist_impl = find_best_compatible_distributed_operator_impl(
dist_op, fwd=True) dist_op, fwd=True)
assert op_dist_impl is not None, "Cannot find the dist op implementation." if op_dist_impl is not None:
dim_changed = op_dist_impl.update_dims_mapping(dist_op) dim_changed = op_dist_impl.update_dims_mapping(dist_op)
if dim_changed: if dim_changed:
changed = True changed = True
...@@ -220,7 +237,7 @@ class Completer: ...@@ -220,7 +237,7 @@ class Completer:
op_dist_attr.impl_idx = op_dist_impl.idx op_dist_attr.impl_idx = op_dist_impl.idx
else: else:
for tensor_node in op_node.outputs: for tensor_node in op_node.outputs:
if tensor_node.var() is not None: if tensor_node.is_var() and tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER: if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue continue
tensor_desc = tensor_node.var() tensor_desc = tensor_node.var()
...@@ -243,7 +260,7 @@ class Completer: ...@@ -243,7 +260,7 @@ class Completer:
# Find the most compatible implementations from the distributed operator # Find the most compatible implementations from the distributed operator
op_dist_impl = find_best_compatible_distributed_operator_impl( op_dist_impl = find_best_compatible_distributed_operator_impl(
dist_op, fwd=False) dist_op, fwd=False)
assert op_dist_impl is not None, "Cannot find the dist op implementation." if op_dist_impl is not None:
dim_changed = op_dist_impl.update_dims_mapping(dist_op) dim_changed = op_dist_impl.update_dims_mapping(dist_op)
if dim_changed: if dim_changed:
changed = True changed = True
...@@ -255,49 +272,26 @@ class Completer: ...@@ -255,49 +272,26 @@ class Completer:
op_dist_attr.impl_idx = op_dist_impl.idx op_dist_attr.impl_idx = op_dist_impl.idx
return changed return changed
def _update_process_mesh(self): def _update_dims_mapping_between_graphs(self):
def _find_nearset_node(nodes, idx):
for node in reversed(nodes[:idx]):
node_dist_attr = self._dist_context.get_dist_attr_for_graph(
node)
if node_dist_attr.process_mesh is not None:
return node
total_reach_fix_point = False
while not total_reach_fix_point:
total_changed = False
for is_fwd in [True, False]:
all_nodes = self._dist_context.serial_ordered_nodes \
if is_fwd else reversed(self._dist_context.serial_ordered_nodes)
reach_fix_point = False
while not reach_fix_point:
changed = False changed = False
for idx, node in enumerate(all_nodes): for parent_node, child_node in self._node_pairs_between_graphs:
nearest_node = _find_nearset_node( parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
self._dist_context.serial_ordered_nodes, idx) parent_node)
if nearest_node is None: child_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
continue child_node)
nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( parent_node_dims_mapping = parent_node_dist_attr.dims_mapping
nearest_node) child_node_dims_mapping = child_node_dist_attr.dims_mapping
nearest_process_mesh = nearest_node_dis_attr.process_mesh compatible_dims_mapping = compute_compatible_dims_mapping(
cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( [parent_node_dims_mapping, child_node_dims_mapping])
node) if (compatible_dims_mapping is not None) \
cur_process_mesh = cur_node_dist_attr.process_mesh and (compatible_dims_mapping != parent_node_dims_mapping):
compatible_process_mesh = compute_compatible_process_mesh( parent_node_dist_attr.dims_mapping = compatible_dims_mapping
[cur_process_mesh, nearest_process_mesh])
if compatible_process_mesh is not None \
and cur_process_mesh != compatible_process_mesh:
cur_node_dist_attr.process_mesh = compatible_process_mesh
changed = True changed = True
if changed: if (compatible_dims_mapping is not None) \
reach_fix_point = False and (compatible_dims_mapping != child_node_dims_mapping):
total_changed = True parent_node_dist_attr.dims_mapping = compatible_dims_mapping
else: changed = True
reach_fix_point = True return changed
if total_changed:
total_reach_fix_point = False
else:
total_reach_fix_point = True
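The new _update_dims_mapping_between_graphs relies on compute_compatible_dims_mapping, whose body is outside this excerpt. A hedged sketch of the rule it is assumed to implement (per dimension, -1 means replicated; two mappings agree when they are equal or one of them is -1, otherwise there is no compatible result) looks like this:

# Assumed semantics only; the real compute_compatible_dims_mapping is defined
# earlier in completion.py and is not shown in this diff.
def compute_compatible_dims_mapping_sketch(dims_mapping_list):
    mappings = [m for m in dims_mapping_list if m is not None]
    if not mappings or any(len(m) != len(mappings[0]) for m in mappings):
        return None
    compatible = []
    for dims in zip(*mappings):
        sharded = {d for d in dims if d != -1}
        if len(sharded) > 1:
            return None  # the same tensor dim is sharded along different mesh dims
        compatible.append(sharded.pop() if sharded else -1)
    return compatible

assert compute_compatible_dims_mapping_sketch([[0, -1], [-1, -1]]) == [0, -1]
assert compute_compatible_dims_mapping_sketch([[0, -1], [1, -1]]) is None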
def _update_dims_mapping(self): def _update_dims_mapping(self):
# Complete dims_mapping for each node # Complete dims_mapping for each node
...@@ -318,11 +312,314 @@ class Completer: ...@@ -318,11 +312,314 @@ class Completer:
node, fwd=is_fwd) node, fwd=is_fwd)
if op_changed: if op_changed:
changed = True changed = True
graph_changed = self._update_dims_mapping_between_graphs()
if graph_changed:
changed = True
if changed: if changed:
reach_fix_point = False reach_fix_point = False
else: else:
reach_fix_point = True reach_fix_point = True
def _update_process_mesh_by_nearest(self, op_node, nearest_op_node):
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
# Set the process mesh of the op node by its nearest op node
if not op_dist_attr.is_annotated("process_mesh"):
process_mesh = op_dist_attr.process_mesh
nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph(
nearest_op_node)
nearest_process_mesh = nearest_op_dis_attr.process_mesh
compatible_process_mesh = compute_compatible_process_mesh(
[process_mesh, nearest_process_mesh])
if compatible_process_mesh is not None \
and process_mesh != compatible_process_mesh:
op_dist_attr.process_mesh = compatible_process_mesh
# Skip the process_mesh setting of inputs and outputs of while_op
if op_dist_attr.op_type == "while":
return
# Set the process mesh of the op node's leaf-inputs
for tensor_node in op_node.inputs:
if tensor_node.is_var() and tensor_node.var() is not None:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if tensor_dist_attr.is_annotated("process_mesh"):
continue
# Skip the non-leaf var node
if len(tensor_node.inputs) != 0:
continue
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and tensor_dist_attr.process_mesh != compatible_process_mesh:
tensor_dist_attr.process_mesh = compatible_process_mesh
# Set the process mesh of the op node's outputs
for tensor_node in op_node.outputs:
if tensor_node.is_var() and tensor_node.var() is not None:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if tensor_dist_attr.is_annotated("process_mesh"):
continue
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and tensor_dist_attr.process_mesh != compatible_process_mesh:
tensor_dist_attr.process_mesh = compatible_process_mesh
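_update_process_mesh_by_nearest leans on compute_compatible_process_mesh, which is also outside this excerpt. One plausible rule, assumed here only for illustration, is that None acts as "unset", equal meshes are compatible, and anything else is a conflict:

# Assumed rule for illustration; the real compute_compatible_process_mesh may
# resolve conflicts differently (e.g. by preferring one of the meshes).
def compute_compatible_process_mesh_sketch(process_mesh_list):
    compatible = None
    for mesh in process_mesh_list:
        if mesh is None:
            continue
        if compatible is None:
            compatible = mesh
        elif compatible != mesh:
            return None
    return compatible

# Plain lists stand in for process meshes in this sketch.
assert compute_compatible_process_mesh_sketch([None, [0, 1]]) == [0, 1]
assert compute_compatible_process_mesh_sketch([[0, 1], [2, 3]]) is None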
def _update_process_mesh_for_specials(self):
def _find_nearest_tensor_node_before(nodes, idx, var_name):
for node in reversed(nodes[:idx]):
if node.is_var() and node.var() is not None \
and node.var().name() == var_name:
return node
def _find_nearest_tensor_node_after(nodes, idx, var_name):
for node in nodes[idx + 1:]:
if node.is_var() and node.var() is not None \
and node.var().name() == var_name:
return node
def _find_nodes_related_to_cond(source_node):
related_nodes = []
visited = set()
frontier = list()
frontier.append(source_node)
# BFS
while len(frontier) != 0:
cur = frontier[0]
frontier = frontier[1:]
if _node_id(cur) in visited:
continue
# TODO: need more restrictions
for node in cur.inputs:
if node.is_var() and node.var() is not None:
if node.var().type() != core.VarDesc.VarType.READER \
and len(node.var().shape()) == 1:
frontier.append(node)
related_nodes.append(node)
if node.is_op() and node.op() is not None:
flag = True
if node.op().type() == "create_py_reader" \
or node.op().type() == "create_double_buffer_reader" \
or node.op().type() == "read":
flag = False
for tensor_node in node.inputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
or len(tensor_node.var().shape()) != 1:
flag = False
break
for tensor_node in node.outputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
or len(tensor_node.var().shape()) != 1:
flag = False
break
if flag:
frontier.append(node)
related_nodes.append(node)
visited.add(_node_id(cur))
return related_nodes
# Amend the process meshes related to while_op
for while_op_node, while_op_node_idx in self._while_op_nodes.values():
sub_graph_id = while_op_node.op()._block_attr_id("sub_block")
sub_graph = self._dist_context._serial_graph.get_sub_graph(
sub_graph_id)
sub_graph_nodes = list(sub_graph.all_nodes())
while_dist_op = self._dist_context.get_dist_op_for_graph(
while_op_node)
while_op_dist_attr = while_dist_op.dist_attr
# Step 1: set the process mesh of while_op to the merged process mesh of its subblock
merged_process_mesh = while_op_dist_attr.process_mesh
for node in sub_graph_nodes:
if (node.is_var() and node.var() is not None) \
or (node.is_op() and node.op() is not None):
dist_attr = self._dist_context.get_dist_attr_for_graph(node)
merged_process_mesh = merge_process_mesh_two(
merged_process_mesh, dist_attr.process_mesh)
while_op_dist_attr.process_mesh = merged_process_mesh
# Step 2: set the related nodes of while_op to the process mesh of while_op
# Step 2.1: Find related nodes of cond var in the graph of while_op
cond_tensor_related_nodes = []
cond_tensor_name = while_op_node.op().input("Condition")[0]
cond_tensor_node = None
for node in while_op_node.inputs:
if node.is_var() and node.var() is not None \
and node.var().name() == cond_tensor_name:
cond_tensor_node = node
cond_tensor_related_nodes.append(cond_tensor_node)
break
cond_tensor_related_nodes.extend(
_find_nodes_related_to_cond(cond_tensor_node))
# Step 2.2: Find related nodes of cond var in the subgraph of while_op
cond_tensor_node = None
for node in reversed(sub_graph_nodes):
if node.is_var() and node.var() is not None \
and node.var().name() == cond_tensor_name \
and len(node.outputs) == 0:
cond_tensor_node = node
break
cond_tensor_related_nodes.extend(
_find_nodes_related_to_cond(cond_tensor_node))
# Step 2.3: Add the StepScopes output of while_op
stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0]
stepscopes_tensor_node = None
for output_node in while_op_node.outputs:
if output_node.is_var() and output_node.var() is not None \
and output_node.var().name() == stepscopes_tensor_name:
stepscopes_tensor_node = output_node
cond_tensor_related_nodes.append(stepscopes_tensor_node)
# Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op
for node in cond_tensor_related_nodes:
tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
node)
tensor_dist_attr.process_mesh = merged_process_mesh
# Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes
while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs
for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items(
):
nearest_tensor_node = _find_nearest_tensor_node_before(
self._dist_context.serial_ordered_nodes, while_op_node_idx,
tensor_name)
nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_tensor_node)
tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
# Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes
while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs
for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items(
):
nearest_tensor_node = _find_nearest_tensor_node_before(
self._dist_context.serial_ordered_nodes, while_op_node_idx,
tensor_name)
if nearest_tensor_node is None:
nearest_tensor_node = _find_nearest_tensor_node_after(
self._dist_context.serial_ordered_nodes,
while_op_node_idx, tensor_name)
nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_tensor_node)
tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
# Amend the process meshes related to array
for array_node_list in self._array_nodes.values():
merged_process_mesh = None
for array_node in array_node_list:
dist_attr = self._dist_context.get_dist_attr_for_graph(
array_node)
merged_process_mesh = merge_process_mesh_two(
merged_process_mesh, dist_attr.process_mesh)
for array_node in array_node_list:
dist_attr = self._dist_context.get_dist_attr_for_graph(
array_node)
dist_attr.process_mesh = merged_process_mesh
def _update_process_mesh(self):
ordered_op_nodes = self._dist_context._serial_ordered_op_nodes
# Step 1: Set the annotated process meshes from tensors to the first ops using them
ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes
for tensor_node in ordered_tensor_nodes:
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
if not tensor_dist_attr.is_annotated("process_mesh"):
continue
first_op_node = None
for op_node in ordered_op_nodes:
# TODO: Need a better rule for the control flow ops.
# For now, do not set the process mesh of while_op from its inputs
if op_node.op().type() == "while":
continue
for input_tensor_node in op_node.inputs:
if _node_id(tensor_node) == _node_id(input_tensor_node):
first_op_node = op_node
break
if first_op_node is not None:
break
if first_op_node is None:
continue
op_dist_attr = self._dist_context.get_dist_attr_for_graph(
first_op_node)
if op_dist_attr is not None and not op_dist_attr.is_annotated(
"process_mesh"):
compatible_process_mesh = compute_compatible_process_mesh(
[tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
if compatible_process_mesh is not None \
and op_dist_attr.process_mesh != compatible_process_mesh:
op_dist_attr.process_mesh = compatible_process_mesh
# Step 2: set the process meshes of ops with the nearest op before them
# Step 2.1: find the first op node which has the process mesh
idx_of_first_op_node_has_process_mesh = -1
for idx, op_node in enumerate(ordered_op_nodes):
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
if op_dist_attr.process_mesh is not None \
and idx_of_first_op_node_has_process_mesh == -1:
idx_of_first_op_node_has_process_mesh = idx
# Reuse the following method to set the related tensors for the same op node
self._update_process_mesh_by_nearest(op_node, op_node)
# Step 2.2: set the process meshes of ops by the nearest op node after the first op node
if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes):
return None
for idx, op_node in enumerate(ordered_op_nodes[
idx_of_first_op_node_has_process_mesh + 1:]):
original_idx = idx_of_first_op_node_has_process_mesh + idx + 1
nearest_op_node = ordered_op_nodes[original_idx - 1]
nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph(
nearest_op_node)
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
assert nearest_op_dist_attr.process_mesh is not None
self._update_process_mesh_by_nearest(op_node, nearest_op_node)
# Step 2.3: set the process meshes of ops by the nearest op node before the first op node
nearest_op_node = ordered_op_nodes[
idx_of_first_op_node_has_process_mesh]
for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]:
self._update_process_mesh_by_nearest(op_node, nearest_op_node)
# Step 3: adjust the process meshes for special ops
self._update_process_mesh_for_specials()
def _prepare(self):
self._while_op_nodes = {}
self._array_nodes = {}
self._node_pairs_between_graphs = []
all_nodes = self._dist_context.serial_ordered_nodes
for idx, node in enumerate(all_nodes):
if node.is_op():
if node.op().type() == "while":
self._while_op_nodes[_node_id(node)] = (node, idx)
if node.op().type() == "read_from_array":
array_var_name = node.op().input("X")[0]
if self._array_nodes.get(array_var_name, None) is None:
self._array_nodes[array_var_name] = []
self._array_nodes[array_var_name].append(node)
if node.op().type() == "write_to_array":
array_var_name = node.op().output("Out")[0]
if self._array_nodes.get(array_var_name, None) is None:
self._array_nodes[array_var_name] = []
self._array_nodes[array_var_name].append(node)
self._array_nodes[array_var_name].append(node.outputs[0])
if node.is_var() and node.var() is not None:
if node.node.graph_id() != 0:
for before_node in reversed(all_nodes[:idx]):
if before_node.is_var() and before_node.var() is not None \
and before_node.node.graph_id() == node.node.graph_id() - 1 \
and before_node.var().name() == node.var().name():
self._node_pairs_between_graphs.append(
(before_node, node))
for after_node in all_nodes[idx + 1:]:
if after_node.is_var() and after_node.var() is not None \
and after_node.node.graph_id() == node.node.graph_id() - 1 \
and after_node.var().name() == node.var().name():
self._node_pairs_between_graphs.append(
(after_node, node))
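The cross-graph bookkeeping at the end of _prepare pairs a var node in a sub-graph with the same-named var node in the enclosing graph (the one whose graph id is exactly one smaller). A simplified restatement with stand-in nodes, kept only to make the pairing rule concrete:

# Stand-in var nodes; the real objects are IR graph nodes with .var() descs.
class _ToyVarNode:
    def __init__(self, name, graph_id):
        self.name, self.graph_id = name, graph_id

def pair_between_graphs(ordered_nodes):
    pairs = []
    for idx, node in enumerate(ordered_nodes):
        if node.graph_id == 0:
            continue  # vars of the main graph have no enclosing graph
        for other in ordered_nodes[:idx] + ordered_nodes[idx + 1:]:
            if other.graph_id == node.graph_id - 1 and other.name == node.name:
                pairs.append((other, node))
    return pairs

nodes = [_ToyVarNode("x", 0), _ToyVarNode("x", 1), _ToyVarNode("y", 1)]
assert pair_between_graphs(nodes) == [(nodes[0], nodes[1])]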
def complete_forward_annotation(self, serial_main_program): def complete_forward_annotation(self, serial_main_program):
""" Complete annotation for the partial annotated serial_main_program. """ Complete annotation for the partial annotated serial_main_program.
Arguments: Arguments:
...@@ -336,24 +633,24 @@ class Completer: ...@@ -336,24 +633,24 @@ class Completer:
# Initialize distributed attributes for all var and op node in serial_main_program # Initialize distributed attributes for all var and op node in serial_main_program
self._dist_context.init_dist_attr_for_program() self._dist_context.init_dist_attr_for_program()
# print_program_with_dist_attr(serial_main_program, self._dist_context)
# Initialize distributed attributes for all var and op node in graph # Initialize distributed attributes for all var and op node in graph
self._dist_context.init_dist_attr_for_graph() self._dist_context.init_dist_attr_for_graph()
self._prepare()
self._update_process_mesh() self._update_process_mesh()
# Complete dims_mapping for each node
self._update_dims_mapping() self._update_dims_mapping()
# Copy the corresponding distributed attribute from graph to serial_main_program # Copy the corresponding distributed attribute from graph to serial_main_program
self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.copy_dist_attr_from_graph_to_program()
self._dist_context.clear_dist_info_for_graph() self._dist_context.clear_dist_info_for_graph()
# print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
# Do the validation check and amend some completion # Do the validation check and amend some completion
self._dist_context.amend_dist_attr_for_program() self._dist_context.amend_dist_attr_for_program()
# print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
self._dist_context.validate_dist_attr_for_program() self._dist_context.validate_dist_attr_for_program()
return serial_main_program return serial_main_program
......
...@@ -175,6 +175,7 @@ class TensorDistributedAttribute: ...@@ -175,6 +175,7 @@ class TensorDistributedAttribute:
class OperatorDistributedAttribute: class OperatorDistributedAttribute:
def __init__(self): def __init__(self):
self._process_mesh = None self._process_mesh = None
self._op_type = None
self._impl_type = None self._impl_type = None
self._impl_idx = None self._impl_idx = None
self._inputs_dist_attrs = {} self._inputs_dist_attrs = {}
...@@ -194,11 +195,23 @@ class OperatorDistributedAttribute: ...@@ -194,11 +195,23 @@ class OperatorDistributedAttribute:
if isinstance(process_mesh, list): if isinstance(process_mesh, list):
process_mesh = ProcessMesh(process_mesh) process_mesh = ProcessMesh(process_mesh)
self._process_mesh = copy.deepcopy(process_mesh) self._process_mesh = copy.deepcopy(process_mesh)
# In the while op, the process mesh is not shared by all inputs and outputs
if self._op_type == "while":
return None
for dist_attr in self._inputs_dist_attrs.values(): for dist_attr in self._inputs_dist_attrs.values():
dist_attr.process_mesh = process_mesh dist_attr.process_mesh = process_mesh
for dist_attr in self._outputs_dist_attrs.values(): for dist_attr in self._outputs_dist_attrs.values():
dist_attr.process_mesh = process_mesh dist_attr.process_mesh = process_mesh
@property
def op_type(self):
return self._op_type
@op_type.setter
def op_type(self, op_type):
if op_type is not None:
self._op_type = op_type
@property @property
def impl_type(self): def impl_type(self):
return self._impl_type return self._impl_type
...@@ -326,6 +339,8 @@ class OperatorDistributedAttribute: ...@@ -326,6 +339,8 @@ class OperatorDistributedAttribute:
assert False, "No setter for {} in args {}.".format( assert False, "No setter for {} in args {}.".format(
key, dist_attr) key, dist_attr)
# Make sure the process_meshes in the dist op are the same # Make sure the process_meshes in the dist op are the same
if self.op_type == "while":
return None
process_meshes = [] process_meshes = []
process_meshes.append(self.process_mesh) process_meshes.append(self.process_mesh)
for tensor_dist_attr in self.inputs_dist_attrs.values(): for tensor_dist_attr in self.inputs_dist_attrs.values():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import copy import copy
from collections import defaultdict from collections import defaultdict
from paddle.fluid import framework from paddle.fluid import framework
from paddle.fluid.framework import get_flags, set_flags
from paddle.fluid import core from paddle.fluid import core
from .dist_attribute import TensorDistributedAttribute from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute
...@@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): ...@@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context):
_g_default_distributed_context = dist_context _g_default_distributed_context = dist_context
def _node_id(node):
return (node.node.graph_id(), node.node.id())
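The rest of this file switches every dictionary key from node.id() to _node_id(node): once the program is converted with all of its blocks, raw node ids are not guaranteed to be unique across sub-graphs, so pairing them with the graph id keeps the keys unique. A toy illustration (the _Toy* classes exist only for this sketch):

def _node_id_sketch(node):
    return (node.node.graph_id(), node.node.id())

class _ToyCoreNode:
    def __init__(self, graph_id, node_id):
        self._graph_id, self._node_id = graph_id, node_id
    def graph_id(self):
        return self._graph_id
    def id(self):
        return self._node_id

class _ToyIrNode:
    def __init__(self, graph_id, node_id):
        self.node = _ToyCoreNode(graph_id, node_id)

main_var = _ToyIrNode(graph_id=0, node_id=7)
sub_var = _ToyIrNode(graph_id=1, node_id=7)  # same raw id in a sub-graph
assert main_var.node.id() == sub_var.node.id()                # raw ids would collide
assert _node_id_sketch(main_var) != _node_id_sketch(sub_var)  # composite keys do not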
class DistributedContext: class DistributedContext:
""" """
DistributedContext is used to collect related distributed information for program and graph. DistributedContext is used to collect related distributed information for program and graph.
...@@ -146,7 +151,7 @@ class DistributedContext: ...@@ -146,7 +151,7 @@ class DistributedContext:
return None return None
def get_dist_tensor_for_graph(self, serial_tensor_node): def get_dist_tensor_for_graph(self, serial_tensor_node):
serial_tensor_node_id = serial_tensor_node.id() serial_tensor_node_id = _node_id(serial_tensor_node)
return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None)
def get_dist_op_for_program(self, serial_op): def get_dist_op_for_program(self, serial_op):
...@@ -168,7 +173,7 @@ class DistributedContext: ...@@ -168,7 +173,7 @@ class DistributedContext:
del self._dist_ops_for_program[serial_tensor_id] del self._dist_ops_for_program[serial_tensor_id]
def get_dist_op_for_graph(self, serial_op_node): def get_dist_op_for_graph(self, serial_op_node):
serial_op_node_id = serial_op_node.id() serial_op_node_id = _node_id(serial_op_node)
return self._dist_ops_for_graph.get(serial_op_node_id, None) return self._dist_ops_for_graph.get(serial_op_node_id, None)
def get_tensor_dist_attr_for_program(self, serial_tensor): def get_tensor_dist_attr_for_program(self, serial_tensor):
...@@ -197,7 +202,7 @@ class DistributedContext: ...@@ -197,7 +202,7 @@ class DistributedContext:
self.add_dist_tensor_for_program(dist_tensor) self.add_dist_tensor_for_program(dist_tensor)
def get_tensor_dist_attr_for_graph(self, serial_tensor_node): def get_tensor_dist_attr_for_graph(self, serial_tensor_node):
serial_tensor_node_id = serial_tensor_node.id() serial_tensor_node_id = _node_id(serial_tensor_node)
dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id,
None) None)
if dist_tensor: if dist_tensor:
...@@ -242,7 +247,7 @@ class DistributedContext: ...@@ -242,7 +247,7 @@ class DistributedContext:
self.add_dist_op_for_program(dist_op) self.add_dist_op_for_program(dist_op)
def get_op_dist_attr_for_graph(self, serial_op_node): def get_op_dist_attr_for_graph(self, serial_op_node):
serial_op_node_id = serial_op_node.id() serial_op_node_id = _node_id(serial_op_node)
dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
if dist_op: if dist_op:
return dist_op.dist_attr return dist_op.dist_attr
...@@ -262,7 +267,7 @@ class DistributedContext: ...@@ -262,7 +267,7 @@ class DistributedContext:
def get_dist_attr_for_graph(self, serial_node): def get_dist_attr_for_graph(self, serial_node):
if serial_node.is_var() and serial_node.var() is not None: if serial_node.is_var() and serial_node.var() is not None:
serial_tensor_node_id = serial_node.id() serial_tensor_node_id = _node_id(serial_node)
dist_tensor = self._dist_tensors_for_graph.get( dist_tensor = self._dist_tensors_for_graph.get(
serial_tensor_node_id, None) serial_tensor_node_id, None)
if dist_tensor: if dist_tensor:
...@@ -270,7 +275,7 @@ class DistributedContext: ...@@ -270,7 +275,7 @@ class DistributedContext:
else: else:
return None return None
if serial_node.is_op() and serial_node.op() is not None: if serial_node.is_op() and serial_node.op() is not None:
serial_op_node_id = serial_node.id() serial_op_node_id = _node_id(serial_node)
dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
if dist_op: if dist_op:
return dist_op.dist_attr return dist_op.dist_attr
...@@ -311,40 +316,69 @@ class DistributedContext: ...@@ -311,40 +316,69 @@ class DistributedContext:
def order_nodes_by_program_order(self): def order_nodes_by_program_order(self):
def _contains(nodes, target_node): def _contains(nodes, target_node):
for node in nodes: for node in nodes:
if node.id() == target_node.id(): if _node_id(node) == _node_id(target_node):
return True return True
return False return False
ordered_tensor_nodes = [] serial_ordered_tensor_nodes = []
ordered_op_nodes = [] serial_ordered_op_nodes = []
all_nodes = self._serial_graph.all_nodes() all_nodes = []
# for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
for node in graph.all_nodes():
all_nodes.append(node)
for node in all_nodes: for node in all_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
ordered_tensor_nodes.append(node) serial_ordered_tensor_nodes.append(node)
if node.is_op() and node.op() is not None: if node.is_op() and node.op() is not None:
ordered_op_nodes.append(node) serial_ordered_op_nodes.append(node)
ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) serial_ordered_tensor_nodes.sort(
ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) key=lambda node: node.node.original_desc_id())
for op_node in ordered_op_nodes: serial_ordered_op_nodes.sort(
key=lambda node: node.node.original_desc_id())
num_nodes_before = len(serial_ordered_tensor_nodes) + len(
serial_ordered_op_nodes)
new_serial_ordered_tensor_nodes = []
new_serial_ordered_op_nodes = []
for op_node in serial_ordered_op_nodes:
tensor_nodes = [] tensor_nodes = []
for tensor_node in op_node.inputs: for tensor_node in op_node.inputs:
if tensor_node.is_var() \ if tensor_node.is_var() \
and tensor_node.var() is not None \ and tensor_node.var() is not None \
and not _contains(self._serial_ordered_nodes, tensor_node): and not _contains(self._serial_ordered_nodes, tensor_node):
tensor_nodes.append(tensor_node) tensor_nodes.append(tensor_node)
new_serial_ordered_tensor_nodes.append(tensor_node)
tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.extend(tensor_nodes)
self._serial_ordered_nodes.append(op_node) self._serial_ordered_nodes.append(op_node)
new_serial_ordered_op_nodes.append(op_node)
tensor_nodes = [] tensor_nodes = []
for tensor_node in op_node.outputs: for tensor_node in op_node.outputs:
if tensor_node.is_var() \ if tensor_node.is_var() \
and tensor_node.var() is not None \ and tensor_node.var() is not None \
and not _contains(self._serial_ordered_nodes, tensor_node): and not _contains(self._serial_ordered_nodes, tensor_node):
tensor_nodes.append(tensor_node) tensor_nodes.append(tensor_node)
new_serial_ordered_tensor_nodes.append(tensor_node)
tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.extend(tensor_nodes)
num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) new_serial_ordered_tensor_nodes.sort(
assert len(self._serial_ordered_nodes) == num_nodes_before, \ key=lambda node: node.node.original_desc_id())
"The number of nodes before ordering is not the same after ordering." new_serial_ordered_op_nodes.sort(
key=lambda node: node.node.original_desc_id())
self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes
self._serial_ordered_op_nodes = new_serial_ordered_op_nodes
assert len(self._serial_ordered_nodes) == len(
self._serial_ordered_tensor_nodes) + len(
self._serial_ordered_op_nodes)
self._serial_orphan_tensor_nodes = []
for tensor_node in serial_ordered_tensor_nodes:
if not _contains(self._serial_ordered_tensor_nodes, tensor_node):
self._serial_orphan_tensor_nodes.append(tensor_node)
if len(self._serial_ordered_nodes) != num_nodes_before:
print(
"WARNING: there are some orphan tensors or ops which are not used in the execution."
)
def init_dist_attr_for_graph(self): def init_dist_attr_for_graph(self):
assert self._is_initialized_for_program, \ assert self._is_initialized_for_program, \
...@@ -352,9 +386,9 @@ class DistributedContext: ...@@ -352,9 +386,9 @@ class DistributedContext:
if self._is_initialized_for_graph: if self._is_initialized_for_graph:
return return
# Convert program to graph # Convert program to graph
set_flags({"FLAGS_convert_all_blocks": True})
self._serial_graph = framework.IrGraph( self._serial_graph = framework.IrGraph(
core.Graph(self._serial_program.desc)) core.Graph(self._serial_program.desc))
all_nodes = self._serial_graph.all_nodes()
self.order_nodes_by_program_order() self.order_nodes_by_program_order()
for node in self.serial_ordered_nodes: for node in self.serial_ordered_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
...@@ -365,10 +399,11 @@ class DistributedContext: ...@@ -365,10 +399,11 @@ class DistributedContext:
if tensor_id == cur_tensor_id \ if tensor_id == cur_tensor_id \
or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id():
dist_tensor = cur_dist_tensor dist_tensor = cur_dist_tensor
self._node_id_to_tensor_id[node.id()] = cur_tensor_id self._node_id_to_tensor_id[_node_id(
node)] = cur_tensor_id
assert dist_tensor is not None, \ assert dist_tensor is not None, \
"Tensor must have a distributed tensor after the initialization for program." "Tensor must have a distributed tensor after the initialization for program."
serial_tensor_node_id = node.id() serial_tensor_node_id = _node_id(node)
new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor,
dist_tensor.dist_attr) dist_tensor.dist_attr)
self._dist_tensors_for_graph[ self._dist_tensors_for_graph[
...@@ -381,10 +416,10 @@ class DistributedContext: ...@@ -381,10 +416,10 @@ class DistributedContext:
if op_id == cur_op_id \ if op_id == cur_op_id \
or op_id == cur_dist_op.serial_op.desc.original_id(): or op_id == cur_dist_op.serial_op.desc.original_id():
dist_op = cur_dist_op dist_op = cur_dist_op
self._node_id_to_op_id[node.id()] = cur_op_id self._node_id_to_op_id[_node_id(node)] = cur_op_id
assert dist_op is not None, \ assert dist_op is not None, \
"Operator must have a distributed operator after the initialization for program." "Operator must have a distributed operator after the initialization for program."
serial_op_node_id = node.id() serial_op_node_id = _node_id(node)
new_dist_op = DistributedOperator(dist_op.serial_op, new_dist_op = DistributedOperator(dist_op.serial_op,
dist_op.dist_attr) dist_op.dist_attr)
self._dist_ops_for_graph[serial_op_node_id] = new_dist_op self._dist_ops_for_graph[serial_op_node_id] = new_dist_op
...@@ -402,10 +437,11 @@ class DistributedContext: ...@@ -402,10 +437,11 @@ class DistributedContext:
assert self._is_initialized_for_program and self._is_initialized_for_graph, \ assert self._is_initialized_for_program and self._is_initialized_for_graph, \
"Both program and graph must be initialized." "Both program and graph must be initialized."
updated_tensors = {} updated_tensors = {}
all_nodes = self._serial_graph.all_nodes() # all_nodes = self._serial_graph.all_nodes()
all_nodes = self._serial_ordered_nodes
for node in all_nodes: for node in all_nodes:
if node.is_var() and node.var() is not None: if node.is_var() and node.var() is not None:
tensor_id = self._node_id_to_tensor_id[node.id()] tensor_id = self._node_id_to_tensor_id[_node_id(node)]
updated = updated_tensors.get(tensor_id, False) updated = updated_tensors.get(tensor_id, False)
# If a var has multiples var nodes in graph, only use the first one for now # If a var has multiples var nodes in graph, only use the first one for now
if not updated: if not updated:
...@@ -416,16 +452,31 @@ class DistributedContext: ...@@ -416,16 +452,31 @@ class DistributedContext:
dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph
updated_tensors[tensor_id] = True updated_tensors[tensor_id] = True
if node.is_op() and node.op() is not None: if node.is_op() and node.op() is not None:
op_id = self._node_id_to_op_id[node.id()] op_id = self._node_id_to_op_id[_node_id(node)]
op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node)
dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program = self._dist_ops_for_program[op_id]
dist_op_for_program.dist_attr = op_dist_attr_for_graph dist_op_for_program.dist_attr = op_dist_attr_for_graph
# TODO: the completion algorithm will skip orphan tensors,
# here we just set their process_mesh to the first one.
for orphan_node in self._serial_orphan_tensor_nodes:
serial_tensor_id = orphan_node.var().id()
dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
None)
if dist_tensor:
dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
else:
serial_tensor_id = orphan_node.var().original_id()
dist_tensor = self._dist_tensors_for_program.get(
serial_tensor_id, None)
dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
def amend_dist_attr_for_program(self): def amend_dist_attr_for_program(self):
for dist_tensor in self._dist_tensors_for_program.values(): for dist_tensor in self._dist_tensors_for_program.values():
serial_tensor = dist_tensor.serial_tensor serial_tensor = dist_tensor.serial_tensor
dist_attr = dist_tensor.dist_attr dist_attr = dist_tensor.dist_attr
if serial_tensor.type == core.VarDesc.VarType.READER: if serial_tensor.type == core.VarDesc.VarType.READER \
or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = serial_tensor.shape tensor_shape = serial_tensor.shape
...@@ -446,6 +497,7 @@ class DistributedContext: ...@@ -446,6 +497,7 @@ class DistributedContext:
tensor_shape = [] tensor_shape = []
else: else:
if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \
or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or dist_op.serial_op.type == "create_py_reader": or dist_op.serial_op.type == "create_py_reader":
tensor_shape = [] tensor_shape = []
else: else:
...@@ -459,8 +511,9 @@ class DistributedContext: ...@@ -459,8 +511,9 @@ class DistributedContext:
and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
dims_mapping[i] = -1 dims_mapping[i] = -1
for arg_name in serial_op.output_arg_names: for arg_name in serial_op.output_arg_names:
if dist_op.get_serial_output( if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \
arg_name).type == core.VarDesc.VarType.READER: or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = dist_op.get_serial_output(arg_name).shape tensor_shape = dist_op.get_serial_output(arg_name).shape
...@@ -498,7 +551,8 @@ class DistributedContext: ...@@ -498,7 +551,8 @@ class DistributedContext:
for k, v in self.__dict__.items(): for k, v in self.__dict__.items():
if k == "_serial_program" or k == "_serial_graph" \ if k == "_serial_program" or k == "_serial_graph" \
or k == "_dist_main_programs" or k == "_dist_startup_programs" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \
or k == "_serial_ordered_nodes": or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \
or k == "_serial_ordered_op_nodes":
setattr(result, k, v) setattr(result, k, v)
else: else:
setattr(result, k, copy.deepcopy(v, memo)) setattr(result, k, copy.deepcopy(v, memo))
......
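The __deepcopy__ hunk above widens the set of members that are shared by reference instead of deep-copied. A minimal sketch of that selective-copy pattern on a toy class (not DistributedContext itself):

import copy

class SelectiveCopy:
    # Heavyweight or graph-bound members are shared; everything else is copied.
    _SHALLOW_KEYS = {"_serial_graph", "_serial_ordered_nodes"}

    def __init__(self, graph, data):
        self._serial_graph = graph
        self._serial_ordered_nodes = []
        self._data = data

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            if k in self._SHALLOW_KEYS:
                setattr(result, k, v)                  # share by reference
            else:
                setattr(result, k, copy.deepcopy(v, memo))
        return result

orig = SelectiveCopy(graph=object(), data=[1, 2, 3])
dup = copy.deepcopy(orig)
assert dup._serial_graph is orig._serial_graph
assert dup._data == orig._data and dup._data is not orig._data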
...@@ -76,7 +76,8 @@ class DistributedOperator: ...@@ -76,7 +76,8 @@ class DistributedOperator:
if tensor is None: if tensor is None:
tensor_shape = [] tensor_shape = []
else: else:
if tensor.type == core.VarDesc.VarType.READER: if tensor.type == core.VarDesc.VarType.READER \
or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = tensor.shape tensor_shape = tensor.shape
...@@ -86,7 +87,9 @@ class DistributedOperator: ...@@ -86,7 +87,9 @@ class DistributedOperator:
tensor_dims_mapping) tensor_dims_mapping)
for tensor_name in self._serial_op.output_arg_names: for tensor_name in self._serial_op.output_arg_names:
tensor = self._serial_op.block._var_recursive(tensor_name) tensor = self._serial_op.block._var_recursive(tensor_name)
if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: if tensor.type == core.VarDesc.VarType.READER \
or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = tensor.shape tensor_shape = tensor.shape
...@@ -95,6 +98,8 @@ class DistributedOperator: ...@@ -95,6 +98,8 @@ class DistributedOperator:
tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
self._dist_attr.set_output_dims_mapping(tensor_name, self._dist_attr.set_output_dims_mapping(tensor_name,
tensor_dims_mapping) tensor_dims_mapping)
if self._dist_attr.op_type is None:
self._dist_attr.op_type = self.serial_op.type
if self._dist_attr.impl_type is None: if self._dist_attr.impl_type is None:
self._dist_attr.impl_type = "default" self._dist_attr.impl_type = "default"
if self._dist_attr.impl_idx is None: if self._dist_attr.impl_idx is None:
...@@ -134,11 +139,15 @@ class DistributedOperator: ...@@ -134,11 +139,15 @@ class DistributedOperator:
return new_dist_attr return new_dist_attr
def validate_dist_attr(self): def validate_dist_attr(self):
if "read" in self.serial_op.type: if "read" in self.serial_op.type or "while" == self.serial_op.type:
return True return True
for name in self.serial_op.input_arg_names: for name in self.serial_op.input_arg_names:
input_dist_attr = self.dist_attr.get_input_dist_attr(name) input_dist_attr = self.dist_attr.get_input_dist_attr(name)
dims_mapping = input_dist_attr.dims_mapping dims_mapping = input_dist_attr.dims_mapping
if self.get_serial_input(
name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
shape = []
else:
shape = self.get_serial_input(name).shape shape = self.get_serial_input(name).shape
if len(shape) != len(dims_mapping): if len(shape) != len(dims_mapping):
return False return False
...@@ -155,6 +164,10 @@ class DistributedOperator: ...@@ -155,6 +164,10 @@ class DistributedOperator:
for name in self.serial_op.output_arg_names: for name in self.serial_op.output_arg_names:
output_dist_attr = self.dist_attr.get_output_dist_attr(name) output_dist_attr = self.dist_attr.get_output_dist_attr(name)
dims_mapping = output_dist_attr.dims_mapping dims_mapping = output_dist_attr.dims_mapping
if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\
or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES:
shape = []
else:
shape = self.get_serial_output(name).shape shape = self.get_serial_output(name).shape
if len(shape) != len(dims_mapping): if len(shape) != len(dims_mapping):
return False return False
...@@ -241,14 +254,14 @@ class DistributedModule: ...@@ -241,14 +254,14 @@ class DistributedModule:
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
from .dist_context import get_default_distributed_context from .dist_context import get_default_distributed_context
main_prog = paddle.fluid.default_main_program() default_prog = paddle.fluid.default_main_program()
main_block = main_prog.global_block() cur_block = default_prog.current_block()
op_size = len(main_block.ops) op_size = len(cur_block.ops)
output = self._serial_module(*args, **kwargs) output = self._serial_module(*args, **kwargs)
new_op_size = len(main_block.ops) new_op_size = len(cur_block.ops)
default_dist_ctx = get_default_distributed_context() default_dist_ctx = get_default_distributed_context()
for idx in range(op_size, new_op_size): for idx in range(op_size, new_op_size):
op = main_block.ops[idx] op = cur_block.ops[idx]
dist_op = DistributedOperator(op, self._dist_attr) dist_op = DistributedOperator(op, self._dist_attr)
dist_op.dist_attr.mark_annotated_as(self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr)
default_dist_ctx.add_dist_op_for_program(dist_op) default_dist_ctx.add_dist_op_for_program(dist_op)
......
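The DistributedModule change above swaps default_main_program().global_block() for current_block(), so ops created while a control-flow sub-block is active are still captured by the before/after op count. The capture pattern itself, on a toy block with an append-only ops list:

class _ToyBlock:
    def __init__(self):
        self.ops = []

def call_and_capture(block, fn, *args, **kwargs):
    # Record how many ops the block had, run the callable, and return the
    # ops that were appended while it ran.
    op_size = len(block.ops)
    output = fn(*args, **kwargs)
    return output, block.ops[op_size:]

block = _ToyBlock()
_, new_ops = call_and_capture(block, lambda: block.ops.append("matmul_v2"))
assert new_ops == ["matmul_v2"]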
...@@ -184,7 +184,9 @@ class DistributedTensor: ...@@ -184,7 +184,9 @@ class DistributedTensor:
def _init_default_dist_attr(self): def _init_default_dist_attr(self):
if self._dist_attr.dims_mapping is None: if self._dist_attr.dims_mapping is None:
if self.serial_tensor.type == core.VarDesc.VarType.READER: if self.serial_tensor.type == core.VarDesc.VarType.READER \
or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
tensor_shape = [] tensor_shape = []
else: else:
tensor_shape = self._serial_tensor.shape tensor_shape = self._serial_tensor.shape
...@@ -192,7 +194,9 @@ class DistributedTensor: ...@@ -192,7 +194,9 @@ class DistributedTensor:
self._dist_attr.dims_mapping = tensor_dims_mapping self._dist_attr.dims_mapping = tensor_dims_mapping
def validate_dist_attr(self): def validate_dist_attr(self):
if self.serial_tensor.type == core.VarDesc.VarType.READER: if self.serial_tensor.type == core.VarDesc.VarType.READER \
or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
return True return True
tensor_shape = self.serial_tensor.shape tensor_shape = self.serial_tensor.shape
if len(tensor_shape) != len(self.dist_attr.dims_mapping): if len(tensor_shape) != len(self.dist_attr.dims_mapping):
......
...@@ -259,7 +259,7 @@ class Engine: ...@@ -259,7 +259,7 @@ class Engine:
"train_" + name: val "train_" + name: val
for name, val in logs.items() for name, val in logs.items()
} }
self._logger.info(logs) self._logger.info(train_logs)
def _train_step(self, data): def _train_step(self, data):
logs = {} logs = {}
......
...@@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute ...@@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute
_g_distributed_operator_impl_containers = {} _g_distributed_operator_impl_containers = {}
_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] _g_elementwise_ops = [
"elementwise_add", "gelu", "dropout", "cast", "gather", "concat"
]
BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'}
......
...@@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
for arg_name in op_desc.input_arg_names(): for arg_name in op_desc.input_arg_names():
serial_tensor = dist_op.get_serial_input(arg_name) serial_tensor = dist_op.get_serial_input(arg_name)
if serial_tensor.is_parameter:
continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if serial_tensor.is_parameter:
for mapping in dims_mapping:
if mapping != -1:
return False
# continue
# if len(dims_mapping) < 1:
# continue
if len(dims_mapping) > 1: if len(dims_mapping) > 1:
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
...@@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
xshape_arg_names = op_desc.output("XShape") xshape_arg_names = op_desc.output("XShape")
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
serial_tensor = dist_op.get_serial_output(arg_name) serial_tensor = dist_op.get_serial_output(arg_name)
if serial_tensor.is_parameter:
continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if serial_tensor.is_parameter:
for mapping in dims_mapping:
if mapping != -1:
return False
# continue
# if len(dims_mapping) < 1:
# continue
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if len(dims_mapping) > 1: if len(dims_mapping) > 1:
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
...@@ -104,6 +114,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -104,6 +114,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
# Check output compatibility # Check output compatibility
...@@ -121,6 +132,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -121,6 +132,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[1:]: for mapping in dims_mapping[1:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
else: else:
if dims_mapping[0] != -1: if dims_mapping[0] != -1:
...@@ -129,6 +141,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -129,6 +141,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
for mapping in dims_mapping[2:]: for mapping in dims_mapping[2:]:
if mapping != -1: if mapping != -1:
return False return False
if len(dims_mapping) >= 2:
batch_dim_mappings.append(dims_mapping[1]) batch_dim_mappings.append(dims_mapping[1])
# Check batch dim mapping compatibility # Check batch dim mapping compatibility
...@@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
op_desc = dist_op.serial_op.desc op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr op_dist_attr = dist_op.dist_attr
# The following statement will be replaced by a more elegant way # The following statement will be replaced by a more elegant way
if op_desc.type() == "shape" or op_desc.type() == "slice": if op_desc.type() == "shape" \
or op_desc.type() == "slice" \
or op_desc.type() == "while":
return False return False
output_names = op_desc.output_names() output_names = op_desc.output_names()
xshape_arg_names = [] xshape_arg_names = []
...@@ -155,6 +170,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -155,6 +170,7 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
if serial_tensor.is_parameter: if serial_tensor.is_parameter:
continue continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
serial_tensor = dist_op.get_serial_output(arg_name) serial_tensor = dist_op.get_serial_output(arg_name)
...@@ -162,10 +178,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -162,10 +178,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
continue continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if len(dims_mapping) >= 1:
batch_dim_mappings.append(dims_mapping[0]) batch_dim_mappings.append(dims_mapping[0])
else: else:
batch_dim_mappings.append(dims_mapping[1]) batch_dim_mappings.append(dims_mapping[1])
if not batch_dim_mappings:
return changed
compatible_dim_mapping = compute_compatible_dim_mapping( compatible_dim_mapping = compute_compatible_dim_mapping(
batch_dim_mappings) batch_dim_mappings)
assert compatible_dim_mapping is not None, "There is no compatible dim mapping." assert compatible_dim_mapping is not None, "There is no compatible dim mapping."
...@@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
if serial_tensor.is_parameter: if serial_tensor.is_parameter:
continue continue
dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
if compatible_dim_mapping != dims_mapping[0]: if len(dims_mapping
) >= 1 and compatible_dim_mapping != dims_mapping[0]:
dims_mapping[0] = compatible_dim_mapping dims_mapping[0] = compatible_dim_mapping
changed = True changed = True
for arg_name in op_desc.output_arg_names(): for arg_name in op_desc.output_arg_names():
...@@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): ...@@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
continue continue
dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
if arg_name not in xshape_arg_names: if arg_name not in xshape_arg_names:
if compatible_dim_mapping != dims_mapping[0]: if len(dims_mapping
) >= 1 and compatible_dim_mapping != dims_mapping[0]:
dims_mapping[0] = compatible_dim_mapping dims_mapping[0] = compatible_dim_mapping
changed = True changed = True
else: else:
if compatible_dim_mapping != dims_mapping[1]: if len(dims_mapping
) >= 2 and compatible_dim_mapping != dims_mapping[1]:
dims_mapping[1] = compatible_dim_mapping dims_mapping[1] = compatible_dim_mapping
changed = True changed = True
......
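update_dims_mapping in DistributedDefaultImpl0 feeds the collected batch-dimension mappings into compute_compatible_dim_mapping before writing the result back, and the new guard returns early when the list is empty. The single-dimension analog of the compatibility rule sketched earlier, again an assumption since the function body is not in this diff:

# Assumed single-dimension rule: -1 means replicated; any two sharded values
# must agree, otherwise there is no compatible mapping.
def compute_compatible_dim_mapping_sketch(dim_mappings):
    compatible = -1
    for dim in dim_mappings:
        if dim == -1:
            continue
        if compatible == -1:
            compatible = dim
        elif compatible != dim:
            return None
    return compatible

assert compute_compatible_dim_mapping_sketch([-1, 0, -1]) == 0
assert compute_compatible_dim_mapping_sketch([0, 1]) is None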
...@@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): ...@@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl):
if is_valid_list_index(y_dims_mapping, if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]): -2) and is_dim_shard(y_dims_mapping[-2]):
return False return False
return True return True
def is_output_compatible(self, dist_op): def is_output_compatible(self, dist_op):
......
...@@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, ...@@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context,
used_dist_context._dist_op_context = DistributedOperatorContext() used_dist_context._dist_op_context = DistributedOperatorContext()
_, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program(
rank_id, used_dist_context) rank_id, used_dist_context)
# print("dist_main_program: ", dist_main_program)
all_dist_main_program.append(dist_main_program) all_dist_main_program.append(dist_main_program)
return all_dist_main_program return all_dist_main_program
......
...@@ -228,3 +228,5 @@ if core.is_compiled_with_npu(): ...@@ -228,3 +228,5 @@ if core.is_compiled_with_npu():
atexit.register(core.clear_executor_cache) atexit.register(core.clear_executor_cache)
# NOTE(Aganlengzi): clean up KernelFactory in advance manually. # NOTE(Aganlengzi): clean up KernelFactory in advance manually.
atexit.register(core.clear_kernel_factory) atexit.register(core.clear_kernel_factory)
# NOTE(wangran16): clean up DeviceManager in advance manually.
atexit.register(core.clear_device_manager)
...@@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): ...@@ -173,6 +173,9 @@ if core.is_compiled_with_xpu():
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16) 'NPU', core.VarDesc.VarType.FP16)
elif core.is_compiled_with_mlu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'MLU', core.VarDesc.VarType.FP16)
else: else:
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16) 'GPU', core.VarDesc.VarType.FP16)
......
...@@ -29,10 +29,11 @@ from .asp import decorate ...@@ -29,10 +29,11 @@ from .asp import decorate
from .asp import prune_model from .asp import prune_model
from .asp import set_excluded_layers from .asp import set_excluded_layers
from .asp import reset_excluded_layers from .asp import reset_excluded_layers
from .supported_layer_list import add_supported_layer
__all__ = [ __all__ = [
'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity',
'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers',
'reset_excluded_layers' 'reset_excluded_layers', 'add_supported_layer'
] ]
...@@ -23,6 +23,8 @@ import paddle ...@@ -23,6 +23,8 @@ import paddle
from paddle.fluid import global_scope, program_guard, layers from paddle.fluid import global_scope, program_guard, layers
from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.initializer import ConstantInitializer
from paddle.fluid.contrib import sparsity from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning
from paddle.fluid import core from paddle.fluid import core
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
...@@ -292,8 +294,8 @@ class ASPHelper(object): ...@@ -292,8 +294,8 @@ class ASPHelper(object):
2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning.
""" """
MASK_APPENDDED_NAME = '_asp_mask' MASK_APPENDDED_NAME = 'asp_mask'
SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} PADDLE_WEIGHT_SUFFIX = "w_"
__asp_info = {} __asp_info = {}
...@@ -334,7 +336,6 @@ class ASPHelper(object): ...@@ -334,7 +336,6 @@ class ASPHelper(object):
r""" r"""
This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`.
""" """
checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo)
if main_program is None: if main_program is None:
main_program = paddle.static.default_main_program() main_program = paddle.static.default_main_program()
...@@ -345,33 +346,27 @@ class ASPHelper(object): ...@@ -345,33 +346,27 @@ class ASPHelper(object):
weight_tensor = global_scope().find_var(param.name).get_tensor() weight_tensor = global_scope().find_var(param.name).get_tensor()
weight_nparray = np.array(weight_tensor) weight_nparray = np.array(weight_tensor)
# The double transpose ops here make sure pruning direction consistent with cuSparseLt. prune_func = ASPHelper._get_prune_func_by_name(param.name)
# SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix.
# cuSparseLt would prune matrix A along k dimension. weight_pruned_nparray, weight_sparse_mask = \
# In sparse training, layer weight matriices is viewed sparse matrix A, so prune_func(weight_nparray, m, n, mask_algo, param.name)
# the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle weight_pruned_nparray = weight_pruned_nparray.astype(
# is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed weight_nparray.dtype)
# for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension
# of W^T, which is m dimension of W. Moreove, all mask generating functions in
# sparsity/utils is row-major pruning. That is the reason we have to transpose weight
# matrices beforce invoking create_mask. Then we transpose the result maks to make
# sure its shape to be the same as the input weight.
weight_sparse_mask = sparsity.create_mask(
weight_nparray.T, func_name=mask_algo, n=n, m=m).T
weight_pruned_nparray = np.multiply(weight_nparray,
weight_sparse_mask)
weight_tensor.set(weight_pruned_nparray, place) weight_tensor.set(weight_pruned_nparray, place)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
'Pruning {} weight matrix failure!!!'.format(param.name)
if with_mask: if with_mask:
weight_mask_param = global_scope().find_var( weight_mask_param = global_scope().find_var(
ASPHelper._get_mask_name(param.name)) ASPHelper._get_mask_name(param.name))
assert weight_mask_param is not None, \ assert weight_mask_param is not None, \
'Cannot find {} variable, please call ASPHelper.minimize' \ 'Cannot find {} variable, please call optimizer.minimize (' \
'paddle.sparsity.decorate(optimizer).minimize(loss)' \
' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name))
weight_mask_tensor = weight_mask_param.get_tensor() weight_mask_tensor = weight_mask_param.get_tensor()
weight_sparse_mask = weight_sparse_mask.astype(
np.array(weight_mask_tensor).dtype)
weight_mask_tensor.set(weight_sparse_mask, place) weight_mask_tensor.set(weight_sparse_mask, place)
asp_info.update_masks(param.name, weight_sparse_mask) asp_info.update_masks(param.name, weight_sparse_mask)
return asp_info.masks.copy() return asp_info.masks.copy()
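For orientation, here is a hedged sketch of how this per-parameter pruning loop is reached from user code; it mirrors the static-graph usage exercised by the tests later in this diff, and the tiny network is a made-up example:

import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib import sparsity

paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    img = fluid.data(name='img', shape=[None, 32], dtype='float32')
    hidden = fluid.layers.fc(input=img, size=32, act='relu')

exe = fluid.Executor(paddle.CPUPlace())
exe.run(startup_program)
# prune_model dispatches to ASPHelper, which runs the loop above and
# calls the per-layer prune_func for every supported parameter.
sparsity.prune_model(main_program, mask_algo="mask_1d", with_mask=False)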
@staticmethod @staticmethod
...@@ -384,7 +379,7 @@ class ASPHelper(object): ...@@ -384,7 +379,7 @@ class ASPHelper(object):
Returns: Returns:
string: The mask name of :attr:`param_name`. string: The mask name of :attr:`param_name`.
""" """
return param_name + ASPHelper.MASK_APPENDDED_NAME return param_name + "." + ASPHelper.MASK_APPENDDED_NAME
@staticmethod @staticmethod
def _get_not_ASP_relevant_vars(main_program): def _get_not_ASP_relevant_vars(main_program):
...@@ -434,19 +429,46 @@ class ASPHelper(object): ...@@ -434,19 +429,46 @@ class ASPHelper(object):
# fc_0.w_0 -> True # fc_0.w_0 -> True
# fc_0.b_0 -> False # fc_0.b_0 -> False
""" """
if ASPHelper.MASK_APPENDDED_NAME in param_name: param_name_list = param_name.split('.')
if ASPHelper.MASK_APPENDDED_NAME in param_name_list:
return False return False
for layer in cls._get_program_asp_info(main_program).excluded_layers: for layer in cls._get_program_asp_info(main_program).excluded_layers:
if layer in param_name: if layer in param_name:
return False return False
for name in ASPHelper.SUPPORTED_LAYERS: if param_name in supported_layers_and_prune_func_map:
if name in param_name and \
ASPHelper.SUPPORTED_LAYERS[name] in param_name:
return True return True
param_name_no_weight_suffix = param_name_list[0]
param_type_suffix = param_name_list[1]
layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix.
rfind('_')]
if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix:
return False return False
if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \
layer_name in supported_layers_and_prune_func_map:
return True
return False
@classmethod
def _get_prune_func_by_name(cls, param_name):
func = supported_layers_and_prune_func_map.get(param_name, None)
param_name_no_weight_suffix = param_name.split('.')[0]
if func is None:
func = supported_layers_and_prune_func_map.get(
param_name_no_weight_suffix, None)
if func is None:
layer_name = param_name_no_weight_suffix[:
param_name_no_weight_suffix.
rfind('_')]
func = supported_layers_and_prune_func_map.get(layer_name,
_default_pruning)
return func
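To make the fallback order above concrete, a minimal sketch of the name parsing (the parameter name "fc_0.w_0" echoes the docstring comment above and is only illustrative):

param_name = "fc_0.w_0"                    # illustrative parameter name
prefix = param_name.split('.')[0]          # "fc_0"
layer_name = prefix[:prefix.rfind('_')]    # "fc"
# _get_prune_func_by_name tries the map with "fc_0.w_0", then "fc_0",
# then "fc", and finally falls back to _default_pruning.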
@classmethod @classmethod
def _minimize(cls, def _minimize(cls,
optimizer, optimizer,
...@@ -509,8 +531,7 @@ class ASPHelper(object): ...@@ -509,8 +531,7 @@ class ASPHelper(object):
if ASPHelper._is_supported_layer(main_program, if ASPHelper._is_supported_layer(main_program,
param_and_grad[0].name): param_and_grad[0].name):
mask_param = layers.create_parameter( mask_param = layers.create_parameter(
name=param_and_grad[0].name + name=ASPHelper._get_mask_name(param_and_grad[0].name),
ASPHelper.MASK_APPENDDED_NAME,
shape=param_and_grad[0].shape, shape=param_and_grad[0].shape,
dtype=param_and_grad[0].dtype, dtype=param_and_grad[0].dtype,
default_initializer=ConstantInitializer(value=1.0)) default_initializer=ConstantInitializer(value=1.0))
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle.fluid.contrib import sparsity
import threading
__all__ = ['add_supported_layer']
def _default_pruning(weight_nparray, m, n, func_name, param_name):
checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
# The double transpose ops here make sure the pruning direction is consistent with cuSparseLt.
# SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is the sparse matrix.
# cuSparseLt would prune matrix A along the k dimension.
# In sparse training, a layer's weight matrix is viewed as the sparse matrix A, so
# the math formula should be 'Act(WX + b)'. However, the default formula in PaddlePaddle
# is 'Act(XW + b)'. To enable SPMMA, weights and inputs should be transposed
# for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune along the k dimension
# of W^T, which is the m dimension of W. Moreover, all mask generating functions in
# sparsity/utils do row-major pruning. That is the reason we have to transpose the weight
# matrices before invoking create_mask. Then we transpose the resulting mask to make
# sure its shape is the same as the input weight.
weight_sparse_mask = sparsity.create_mask(
weight_nparray.T, func_name=func_name, n=n, m=m).T
weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
'Pruning {} weight matrix failure!!!'.format(param_name)
return weight_pruned_nparray, weight_sparse_mask
# When the value of a given key in this dict is None,
# ASP will call the default pruning function in the pruning stage.
_supported_layers_and_prune_func_map_lock = threading.Lock()
supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None):
r"""
Add a supported layer and its corresponding pruning function.
Args:
layer (string|Layer): The name or type of the layer to support. If `layer` is a `Layer`, it is
converted to a string internally. ASP uses this name to match parameter names and to call
the corresponding pruning function.
pruning_func (function, optional): a function that receives five arguments (weight_nparray,
m, n, func_name, param_name); weight_nparray is the weight as a numpy ndarray and param_name is
the weight's name; for m, n, and func_name, please see `prune_model` for details.
"""
name = None
if isinstance(layer, str):
name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
type(layer).__name__)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__)
else:
assert "The type of layer should be string of Layer, but got {}!".format(
type(layer))
if pruning_func is None:
pruning_func = _default_pruning
_supported_layers_and_prune_func_map_lock.acquire()
supported_layers_and_prune_func_map.update({name: pruning_func})
_supported_layers_and_prune_func_map_lock.release()
add_supported_layer('fc')
add_supported_layer('linear')
add_supported_layer('conv2d')
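As a hedged usage sketch of the registration API defined above (the layer name "my_layer" and the function my_pruning are made-up; omitting pruning_func falls back to _default_pruning):

import numpy as np
from paddle.fluid.contrib import sparsity

def my_pruning(weight_nparray, m, n, func_name, param_name):
    # Toy pruning function with the required five-argument signature:
    # keep the weight unchanged and return an all-ones mask of the same shape.
    mask = np.ones_like(weight_nparray)
    return weight_nparray, mask

# Parameters whose names match "my_layer" will be pruned with my_pruning;
# registering a name without a function would use _default_pruning instead.
sparsity.add_supported_layer("my_layer", my_pruning)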
...@@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._rcvd_idx += 1 self._rcvd_idx += 1
self._batches_outstanding -= 1 self._batches_outstanding -= 1
else: else:
# NOTE: when _rcvd_idx catches up with _send_idx, it means
# one of the following:
# 1. all 2 * num_workers batches have been loaded
# and stored in _blocking_queue
# 2. all data has been drained
# In either case we let _thread block on _data_queue
# get_data so it does not busy-wait and take CPU time
# away from model execution
# NOTE: in persistent workers mode, do not check data # NOTE: in persistent workers mode, do not check data
# drained here, simply let it go to _data_queue # drained here, simply let it go to _data_queue
# reading to get _ResumeIteration # reading to get _ResumeIteration
...@@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
# may also be data in blocking queue # may also be data in blocking queue
if self._batches_outstanding < len(self._places): if self._batches_outstanding < len(self._places):
return None return None
continue
if self._rcvd_idx in self._task_infos and \ if self._rcvd_idx in self._task_infos and \
len(self._task_infos[self._rcvd_idx]) == 3: len(self._task_infos[self._rcvd_idx]) == 3:
......
...@@ -271,13 +271,14 @@ def amp_guard(enable=True, ...@@ -271,13 +271,14 @@ def amp_guard(enable=True,
"current_tracer is None, maybe it is not in imperative mode.") "current_tracer is None, maybe it is not in imperative mode.")
# check device_type: # check device_type:
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16. # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16.
# Maybe we will support cpu for bfloat16. # Maybe we will support cpu for bfloat16.
if enable and not (tracer._expected_place.is_gpu_place() or if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place() or tracer._expected_place.is_xpu_place() or
tracer._expected_place.is_mlu_place() or
tracer._expected_place.is_npu_place()): tracer._expected_place.is_npu_place()):
warnings.warn( warnings.warn(
'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.' 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
# For npu: # For npu:
...@@ -288,6 +289,10 @@ def amp_guard(enable=True, ...@@ -288,6 +289,10 @@ def amp_guard(enable=True,
if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
warnings.warn('XPUPlace only support float16 amp.') warnings.warn('XPUPlace only support float16 amp.')
enable = False enable = False
# For mlu:
if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'):
warnings.warn('MLUPlace only support float16 amp.')
enable = False
# For gpu float16: Compute Capability should >= 7. # For gpu float16: Compute Capability should >= 7.
# For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11.
if tracer._expected_place.is_gpu_place(): if tracer._expected_place.is_gpu_place():
......
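As a hedged illustration of the device/dtype restriction this hunk adds (a minimal sketch using the public paddle.amp.auto_cast entry point, which wraps amp_guard; the model and input are made up):

import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

# On an MLU build only float16 AMP is supported; requesting bfloat16 there
# would emit the warning added above and silently disable AMP.
with paddle.amp.auto_cast(enable=True, dtype='float16'):
    out = model(x)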
...@@ -106,9 +106,10 @@ class AmpScaler(object): ...@@ -106,9 +106,10 @@ class AmpScaler(object):
if enable and not (tracer._expected_place.is_gpu_place() or if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place() or tracer._expected_place.is_xpu_place() or
tracer._expected_place.is_mlu_place() or
tracer._expected_place.is_npu_place()): tracer._expected_place.is_npu_place()):
warnings.warn( warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.' 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
......
...@@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): ...@@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
if dim_size == -1: if dim_size == -1:
assert unk_dim_idx == -1, ( assert unk_dim_idx == -1, (
"Only one dimension value of 'shape' in reshape can " "Only one dimension value of 'shape' in reshape can "
"be -1. But received shape[%d] is also -1." % dim_idx) "be -1. But received shape[%d] is also -1.\n"
"\n\t# N = x.shape()[2]\t\t# N is an int. "
"(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t"
"# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])"
"\t# z.shape is [-1, -1, 4]\n\n"
" If your target shape in Reshape represents dynamic shape, "
"please turn it into a Tensor under @to_static. See above example for details."
% dim_idx)
unk_dim_idx = dim_idx unk_dim_idx = dim_idx
elif dim_size == 0: elif dim_size == 0:
assert dim_idx < len(x.shape), ( assert dim_idx < len(x.shape), (
......
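The guidance embedded in this longer error message can be written out as a small sketch (assuming `x` is a 4-D tensor and that the function runs under @paddle.jit.to_static, as the message recommends):

import paddle

@paddle.jit.to_static
def dynamic_reshape(x):
    # N = x.shape[2]           # N is a Python int (NOT recommended under @to_static)
    N = paddle.shape(x)[2]     # N is a Tensor (recommended)
    z = paddle.reshape(x, [N, -1, 4])   # static shape is inferred as [-1, -1, 4]
    return z

out = dynamic_reshape(paddle.rand([2, 3, 8, 4]))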
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake
class MyOwnLayer(Layer):
def __init__(self):
super(MyOwnLayer, self).__init__()
def forward(self, x):
return x
static_tensor = None
static_tensor_mask = None
def my_own_pruning(tensor, m, n, mask_algo, param_name):
global static_tensor
global static_tensor_mask
if static_tensor is None:
static_tensor = np.random.rand(*tensor.shape).astype(np.float32)
if static_tensor_mask is None:
static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32)
return static_tensor, static_tensor_mask
class TestASPAddSupportedLayer(unittest.TestCase):
def test_add_supported_layer_via_name(self):
sparsity.add_supported_layer("test_supported_1")
sparsity.add_supported_layer("test_supported_2", my_own_pruning)
sparsity.add_supported_layer(MyOwnLayer)
my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__)
self.assertTrue(
"test_supported_1" in supported_layers_and_prune_func_map)
self.assertTrue(
"test_supported_2" in supported_layers_and_prune_func_map)
self.assertTrue(
"test_supported_2" in supported_layers_and_prune_func_map)
self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"]
== my_own_pruning)
self.assertTrue(
my_own_layer_name in supported_layers_and_prune_func_map)
class TestASPStaticCustomerizedPruneFunc(unittest.TestCase):
def setUp(self):
paddle.enable_static()
self.main_program = fluid.Program()
self.startup_program = fluid.Program()
self.customer_prefix = "customer_layer"
def build_model():
img = fluid.data(
name='img', shape=[None, 3, 32, 32], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
hidden = fluid.layers.conv2d(
input=img, num_filters=4, filter_size=3, padding=2, act="relu")
hidden = fluid.layers.fc(input=hidden,
size=32,
act='relu',
name=self.customer_prefix)
hidden = fluid.layers.fc(input=hidden,
size=32,
act='relu',
name=self.customer_prefix)
hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
return img, label, prediction
with fluid.program_guard(self.main_program, self.startup_program):
self.img, self.label, self.predict = build_model()
self.supported_layer_count_ref = 5
self.place = paddle.CPUPlace()
if core.is_compiled_with_cuda():
self.place = paddle.CUDAPlace(0)
self.exe = fluid.Executor(self.place)
sparsity.add_supported_layer(self.customer_prefix, my_own_pruning)
def test_inference_pruning(self):
self.exe.run(self.startup_program)
sparsity.prune_model(
self.main_program, mask_algo="mask_1d", with_mask=False)
supported_layer_count = 0
for param in self.main_program.global_block().all_parameters():
mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
))
if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
param.name):
supported_layer_count += 1
if (self.customer_prefix in param.name):
self.assertLessEqual(
np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
else:
self.assertTrue(
sparsity.check_sparsity(
mat.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
def test_training_pruning(self):
with fluid.program_guard(self.main_program, self.startup_program):
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=self.predict, label=self.label))
optimizer = sparsity.decorate(
fluid.optimizer.SGD(learning_rate=0.01))
optimizer.minimize(loss, self.startup_program)
self.exe.run(self.startup_program)
sparsity.prune_model(
self.main_program, mask_algo="mask_1d", with_mask=True)
supported_layer_count = 0
for param in self.main_program.global_block().all_parameters():
mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
))
if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
param.name):
mat_mask = np.array(fluid.global_scope().find_var(
sparsity.asp.ASPHelper._get_mask_name(param.name))
.get_tensor())
supported_layer_count += 1
if (self.customer_prefix in param.name):
self.assertLessEqual(
np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
self.assertLessEqual(
np.sum(mat_mask.flatten() - static_tensor_mask.flatten(
)), 1e-4)
else:
self.assertTrue(
sparsity.check_sparsity(
mat.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertTrue(
sparsity.check_sparsity(
mat_mask.T,
func_name=sparsity.CheckMethod.CHECK_1D,
n=2,
m=4))
self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
if __name__ == '__main__':
unittest.main()
...@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS})
py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import numpy as np
import paddle.nn as nn
import paddle.utils as utils
import paddle.static as static
import paddle.nn.functional as F
import paddle.distributed.auto_parallel as auto
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.utils import make_data_unshard
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context
from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
paddle.enable_static()
batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = [[0, 1], [2, 3]]
def get_random_inputs_and_labels(input_shape, label_shape):
input = np.random.random(size=input_shape).astype('float32')
label = np.random.random(size=label_shape).astype('float32')
return input, label
def batch_generator_creator():
def __reader__():
for _ in range(batch_size):
batch_input, batch_label = get_random_inputs_and_labels(
[batch_size, sequence_len, hidden_size],
[batch_size, sequence_len, 1])
yield batch_input, batch_label
return __reader__
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=1024,
intermediate_size=4 * 1024,
dropout_ratio=0.1,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
param_initializer = nn.initializer.Normal(
mean=0.0, std=initializer_range)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
self.linear0 = nn.Linear(
d_model,
dim_feedforward,
weight_attr=paddle.ParamAttr(initializer=param_initializer),
bias_attr=None)
self.linear1 = nn.Linear(
dim_feedforward,
d_model,
weight_attr=paddle.ParamAttr(initializer=param_initializer),
bias_attr=None)
def forward(self, input):
out = self.norm(input)
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, 0]
})
out = self.linear0(out)
out = F.gelu(out, approximate=True)
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": _g_process_mesh[1],
"dims_mapping": [0, -1]
})
out = self.linear1(out)
return out
def loop_cond(i, loop_len, input_array):
return i < loop_len
def loop_body(i, loop_len, input_array):
pre_input = paddle.tensor.array_read(array=input_array, i=i)
mlp_while0 = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
mlp_while1 = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
output = mlp_while0(pre_input)
cur_pred = mlp_while1(output)
# update the loop condition
i = paddle.increment(x=i, value=1)
paddle.tensor.array_write(cur_pred, array=input_array, i=i)
return i, loop_len, input_array
def get_program():
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
# fleet.init(is_collective=True, strategy=dist_strategy)
train_program = static.Program()
start_program = static.Program()
with static.program_guard(train_program, start_program):
# loop counter
i = paddle.full(shape=[1], fill_value=0, dtype='int64')
# number of loop iterations
loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64')
# input
input = static.data(
name="input",
shape=[batch_size, sequence_len, hidden_size],
dtype='float32')
label = static.data(
name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
data_holder = [input, label]
# dataloader
dataloader = paddle.io.DataLoader.from_generator(
feed_list=data_holder, capacity=4 * batch_size, iterable=False)
dataloader.set_batch_generator(
batch_generator_creator(), places=paddle.static.cuda_places())
# data dist_attr
auto.shard_tensor(
input,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, -1, -1]
})
auto.shard_tensor(
label,
dist_attr={
"process_mesh": _g_process_mesh[0],
"dims_mapping": [-1, -1, -1]
})
mlp_start = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
pred = mlp_start(input)
input_array = paddle.tensor.array_write(pred, i)
i, loop_len, input_array = static.nn.while_loop(
cond=loop_cond,
body=loop_body,
loop_vars=[i, loop_len, input_array])
end_pred = paddle.tensor.array_read(array=input_array, i=i)
mlp_end = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
pred = mlp_end(end_pred)
error_cost = paddle.nn.functional.square_error_cost(pred, label)
loss = paddle.mean(error_cost)
return train_program, start_program, dataloader, i, loss
class TestMLP(unittest.TestCase):
def test_completer(self):
train_program, start_program, dataloader, i, loss = get_program()
dist_context = DistributedContext()
completer = Completer(dist_context)
complete_train_program = completer.complete_forward_annotation(
train_program)
# print_program_with_dist_attr(complete_train_program, dist_context)
if __name__ == "__main__":
unittest.main()
...@@ -123,17 +123,26 @@ class XPUOpTest(OpTest): ...@@ -123,17 +123,26 @@ class XPUOpTest(OpTest):
return super().check_grad_with_place( return super().check_grad_with_place(
place, inputs_to_check, output_names, no_grad_set, place, inputs_to_check, output_names, no_grad_set,
numeric_grad_delta, in_place, max_relative_error, numeric_grad_delta, in_place, max_relative_error,
user_defined_grads, user_defined_grads, check_dygraph) user_defined_grads, user_defined_grad_outputs, check_dygraph)
a1 = self.get_grad_with_place( a1 = self.get_grad_with_place(
place, inputs_to_check, output_names, no_grad_set=no_grad_set) place,
inputs_to_check,
output_names,
no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
a2 = self.get_grad_with_place( a2 = self.get_grad_with_place(
place, inputs_to_check, output_names, no_grad_set=no_grad_set) place,
inputs_to_check,
output_names,
no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
a3 = self.get_grad_with_place( a3 = self.get_grad_with_place(
paddle.CPUPlace(), paddle.CPUPlace(),
inputs_to_check, inputs_to_check,
output_names, output_names,
no_grad_set=no_grad_set) no_grad_set=no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, self._assert_is_close(a1, a2, inputs_to_check, 0.00000001,
"Gradient Check On two xpu") "Gradient Check On two xpu")
self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, self._assert_is_close(a1, a3, inputs_to_check, max_relative_error,
...@@ -147,7 +156,7 @@ class XPUOpTest(OpTest): ...@@ -147,7 +156,7 @@ class XPUOpTest(OpTest):
numeric_grad_delta=0.005, numeric_grad_delta=0.005,
in_place=False, in_place=False,
max_relative_error=0.005, max_relative_error=0.005,
user_defined_grads=None, user_defined_grad_outputs=None,
check_dygraph=True): check_dygraph=True):
self.scope = core.Scope() self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_inputs = self.inputs if hasattr(self, "inputs") else dict()
...@@ -197,6 +206,10 @@ class XPUOpTest(OpTest): ...@@ -197,6 +206,10 @@ class XPUOpTest(OpTest):
if not type(output_names) is list: if not type(output_names) is list:
output_names = [output_names] output_names = [output_names]
analytic_grads = self._get_gradient(inputs_to_check, place, analytic_grads = self._get_gradient(
output_names, no_grad_set) inputs_to_check,
place,
output_names,
no_grad_set,
user_defined_grad_outputs=user_defined_grad_outputs)
return analytic_grads return analytic_grads
...@@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
set(parameters), set(parameters),
set([ set([
'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0',
'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask',
'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask',
'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0',
'fc_2.b_0_velocity_0' 'fc_2.b_0_velocity_0'
])) ]))
self.assertEqual(ops, [ self.assertEqual(ops, [
......
...@@ -333,6 +333,7 @@ class TestVariable(unittest.TestCase): ...@@ -333,6 +333,7 @@ class TestVariable(unittest.TestCase):
with self.assertRaises(IndexError): with self.assertRaises(IndexError):
res = x[[True, False, False]] res = x[[True, False, False]]
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
with paddle.static.program_guard(prog):
res = x[[False, False]] res = x[[False, False]]
def test_slice(self): def test_slice(self):
......
...@@ -59,16 +59,14 @@ class SGD(Optimizer): ...@@ -59,16 +59,14 @@ class SGD(Optimizer):
.. code-block:: python .. code-block:: python
import paddle import paddle
import numpy as np
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp) inp = paddle.to_tensor(inp)
out = linear(inp) out = linear(inp)
loss = paddle.mean(out) loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
back = out.backward() out.backward()
sgd.step() sgd.step()
sgd.clear_grad() sgd.clear_grad()
......